* [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv
[not found] <20240615115034.3891490-1-uk7b@foxmail.com>
@ 2024-06-15 11:50 ` uk7b
2024-06-15 11:52 ` flow gg
2024-06-24 20:07 ` Rémi Denis-Courmont
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv uk7b
2 siblings, 2 replies; 15+ messages in thread
From: uk7b @ 2024-06-15 11:50 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
vp9_avg_bilin_4hv_8bpp_c : 10.7 9.5
vp9_avg_bilin_4hv_8bpp_rvv_i32 : 4.0 3.5
vp9_avg_bilin_8hv_8bpp_c : 38.5 34.2
vp9_avg_bilin_8hv_8bpp_rvv_i32 : 7.2 6.5
vp9_avg_bilin_16hv_8bpp_c : 147.2 130.5
vp9_avg_bilin_16hv_8bpp_rvv_i32 : 14.5 12.7
vp9_avg_bilin_32hv_8bpp_c : 574.2 509.7
vp9_avg_bilin_32hv_8bpp_rvv_i32 : 42.5 38.0
vp9_avg_bilin_64hv_8bpp_c : 2321.2 2017.7
vp9_avg_bilin_64hv_8bpp_rvv_i32 : 163.5 131.0
vp9_put_bilin_4hv_8bpp_c : 10.0 8.7
vp9_put_bilin_4hv_8bpp_rvv_i32 : 3.5 3.0
vp9_put_bilin_8hv_8bpp_c : 35.2 31.2
vp9_put_bilin_8hv_8bpp_rvv_i32 : 6.5 5.7
vp9_put_bilin_16hv_8bpp_c : 134.0 119.0
vp9_put_bilin_16hv_8bpp_rvv_i32 : 12.7 11.5
vp9_put_bilin_32hv_8bpp_c : 538.5 464.2
vp9_put_bilin_32hv_8bpp_rvv_i32 : 39.7 35.2
vp9_put_bilin_64hv_8bpp_c : 2111.7 1833.2
vp9_put_bilin_64hv_8bpp_rvv_i32 : 138.5 122.5
---
libavcodec/riscv/vp9_mc_rvv.S | 38 +++++++++++++++++++++++++++++++++-
libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
2 files changed, 47 insertions(+), 1 deletion(-)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index fb7377048a..5241562531 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
endfunc
.endm
+.macro bilin_hv op
+func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
+ vsetvlstatic8 64, t0, 64
+.Lbilin_hv\op:
+.ifc \op,avg
+ csrwi vxrm, 0
+.endif
+ neg t1, a5
+ neg t2, a6
+ li t4, 8
+ bilin_load_h v24, put, a5
+ add a2, a2, a3
+1:
+ addi a4, a4, -1
+ bilin_load_h v4, put, a5
+ vwmulu.vx v16, v4, a6
+ vwmaccsu.vx v16, t2, v24
+ vwadd.wx v16, v16, t4
+ vnsra.wi v16, v16, 4
+ vadd.vv v0, v16, v24
+.ifc \op,avg
+ vle8.v v16, (a0)
+ vaaddu.vv v0, v0, v16
+.endif
+ vse8.v v0, (a0)
+ vmv.v.v v24, v4
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
.irp len, 64, 32, 16, 8, 4
copy_avg \len
.endr
@@ -155,6 +189,8 @@ bilin_h_v put, h, a5
bilin_h_v avg, h, a5
bilin_h_v put, v, a6
bilin_h_v avg, v, a6
+bilin_hv put
+bilin_hv avg
.macro func_bilin_h_v len, op, type
func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
@@ -165,7 +201,7 @@ endfunc
.irp len, 32, 16, 8, 4
.irp op, put, avg
- .irp type, h, v
+ .irp type, h, v, hv
func_bilin_h_v \len, \op, \type
.endr
.endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 9606d8545f..b3700dfb08 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
+ dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
+ dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
+ dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
+ dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
+ dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
+ dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
+ dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
+ dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
+ dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
+ dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
#undef init_fpel
}
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
[not found] <20240615115034.3891490-1-uk7b@foxmail.com>
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-06-15 11:50 ` uk7b
2024-06-15 11:52 ` flow gg
2024-07-13 9:02 ` Rémi Denis-Courmont
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv uk7b
2 siblings, 2 replies; 15+ messages in thread
From: uk7b @ 2024-06-15 11:50 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
vp9_avg_8tap_smooth_4h_8bpp_c : 12.7 11.2
vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 4.7 4.2
vp9_avg_8tap_smooth_4v_8bpp_c : 29.7 12.5
vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 4.7 4.2
vp9_avg_8tap_smooth_8h_8bpp_c : 48.7 42.2
vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.5 8.5
vp9_avg_8tap_smooth_8v_8bpp_c : 49.7 45.5
vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5
vp9_avg_8tap_smooth_16h_8bpp_c : 192.0 166.5
vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.7 19.5
vp9_avg_8tap_smooth_16v_8bpp_c : 191.2 175.2
vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.2 19.0
vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.2
vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 68.2 60.5
vp9_avg_8tap_smooth_32v_8bpp_c : 770.0 685.7
vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.0 59.5
vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 270.7 120.7
vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 266.5 119.0
vp9_put_8tap_smooth_4h_8bpp_c : 11.0 9.7
vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 3.7
vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.5
vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.0 3.7
vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5
vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7
vp9_put_8tap_smooth_8v_8bpp_c : 43.5 38.5
vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.7 7.7
vp9_put_8tap_smooth_16h_8bpp_c : 181.7 147.2
vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 20.0 18.0
vp9_put_8tap_smooth_16v_8bpp_c : 168.5 149.7
vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5
vp9_put_8tap_smooth_32h_8bpp_c : 675.0 586.5
vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 65.2 58.0
vp9_put_8tap_smooth_32v_8bpp_c : 664.7 591.2
vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.0 57.0
vp9_put_8tap_smooth_64h_8bpp_c : 2696.2 2339.0
vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 259.7 115.7
vp9_put_8tap_smooth_64v_8bpp_c : 2691.0 2348.5
vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0
---
libavcodec/riscv/vp9_mc_rvv.S | 200 +++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp.h | 72 ++++++++----
libavcodec/riscv/vp9dsp_init.c | 38 ++++++-
3 files changed, 285 insertions(+), 25 deletions(-)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 5241562531..5e81301aa5 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -36,6 +36,18 @@
.endif
.endm
+.macro vsetvlstatic16 len
+.ifc \len,4
+ vsetvli zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+ vsetvli zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+ vsetvli zero, zero, e16, m2, ta, ma
+.else
+ vsetvli zero, zero, e16, m4, ta, ma
+.endif
+.endm
+
.macro copy_avg len
func ff_vp9_avg\len\()_rvv, zve32x
csrwi vxrm, 0
@@ -181,8 +193,196 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
endfunc
.endm
+.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
+.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
+.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
+
+.macro epel_filter name, type, regtype
+ lla \regtype\()2, ff_vp9_subpel_filters_\name
+
+.ifc \type,v
+ slli \regtype\()0, a6, 4
+.else
+ slli \regtype\()0, a5, 4
+.endif
+ add \regtype\()0, \regtype\()0, \regtype\()2
+
+ lh \regtype\()1, 2(\regtype\()0)
+ lh \regtype\()2, 4(\regtype\()0)
+ lh \regtype\()3, 6(\regtype\()0)
+ lh \regtype\()4, 8(\regtype\()0)
+ lh \regtype\()5, 10(\regtype\()0)
+ lh \regtype\()6, 12(\regtype\()0)
+
+.ifc \regtype,t
+ lh a7, 14(\regtype\()0)
+.else
+ lh s7, 14(\regtype\()0)
+.endif
+ lh \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst, len, op, name, type, from_mem, regtype
+.ifc \from_mem, 1
+ vle8.v v22, (a2)
+.ifc \type,v
+ add a5, a3, a2
+ sub a2, a2, a3
+ vle8.v v24, (a5)
+ vle8.v v20, (a2)
+ sh1add a2, a3, a5
+ add a5, a5, a3
+ vle8.v v26, (a5)
+ vle8.v v28, (a2)
+ add a2, a2, a3
+ vle8.v v30, (a2)
+.else
+ addi a5, a2, 1
+ addi a2, a2, -1
+ vle8.v v24, (a5)
+ vle8.v v20, (a2)
+ addi a5, a5, 2
+ addi a2, a2, 3
+ vle8.v v28, (a5)
+ vle8.v v26, (a2)
+ addi a2, a5, 1
+ vle8.v v30, (a2)
+.endif
+
+.ifc \name,smooth
+ vwmulu.vx v16, v24, \regtype\()4
+ vwmaccu.vx v16, \regtype\()2, v20
+ vwmaccu.vx v16, \regtype\()5, v26
+ vwmaccsu.vx v16, \regtype\()6, v28
+.else
+ vwmulu.vx v16, v28, \regtype\()6
+ vwmaccsu.vx v16, \regtype\()2, v20
+ vwmaccsu.vx v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+ vwmaccsu.vx v16, a7, v30
+.else
+ vwmaccsu.vx v16, s7, v30
+.endif
+
+.ifc \type,v
+ sh1add a5, a3, a3
+ sub a2, a2, a5
+ sub a2, a2, a5
+ sub a5, a2, a3
+ vle8.v v28, (a2)
+ vle8.v v26, (a5)
+ sh1add a2, a3, a2
+.else
+ addi a5, a2, -7
+ addi a2, a2, -6
+ vle8.v v26, (a5)
+ vle8.v v28, (a2)
+ addi a2, a2, 2
+.endif
+
+.ifc \name,smooth
+ vwmaccsu.vx v16, \regtype\()1, v28
+.else
+ vwmaccu.vx v16, \regtype\()1, v28
+ vwmulu.vx v28, v24, \regtype\()4
+.endif
+ vwmaccsu.vx v16, \regtype\()0, v26
+ vwmulu.vx v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+ vwmulu.vx v16, v8, \regtype\()4
+ vwmaccu.vx v16, \regtype\()2, v4
+ vwmaccu.vx v16, \regtype\()5, v10
+ vwmaccsu.vx v16, \regtype\()6, v12
+ vwmaccsu.vx v16, \regtype\()1, v2
+.else
+ vwmulu.vx v16, v2, \regtype\()1
+ vwmaccu.vx v16, \regtype\()6, v12
+ vwmaccsu.vx v16, \regtype\()5, v10
+ vwmaccsu.vx v16, \regtype\()2, v4
+ vwmulu.vx v28, v8, \regtype\()4
+.endif
+ vwmaccsu.vx v16, \regtype\()0, v0
+ vwmulu.vx v20, v6, \regtype\()3
+
+.ifc \regtype,t
+ vwmaccsu.vx v16, a7, v14
+.else
+ vwmaccsu.vx v16, s7, v14
+.endif
+
+.endif
+ li a5, 64
+ vwadd.wx v16, v16, a5
+ vsetvlstatic16 \len
+
+.ifc \name,smooth
+ vwadd.vv v24, v16, v20
+.else
+ vwadd.vv v24, v16, v28
+ vwadd.wv v24, v24, v20
+.endif
+ vnsra.wi v24, v24, 7
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \len, zero, 32, m2
+
+ vnclipu.wi \dst, v24, 0
+.ifc \op,avg
+ vle8.v v24, (a0)
+ vaaddu.vv \dst, \dst, v24
+.endif
+
+.endm
+
+.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
+ epel_load \dst, \len, \op, \name, \type, \from_mem, \regtype
+ add a2, a2, a3
+.endm
+
+.macro epel len, op, name, type, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
+ epel_filter \name, \type, t
+.if \vlen < 256
+ vsetvlstatic8 \len, a5, 32, m2
+.else
+ vsetvlstatic8 \len, a5, 64, m2
+.endif
+.ifc \op,avg
+ csrwi vxrm, 0
+.endif
+
+1:
+ addi a4, a4, -1
+ epel_load v30, \len, \op, \name, \type, 1, t
+ vse8.v v30, (a0)
+.if \len == 64 && \vlen < 256
+ addi a0, a0, 32
+ addi a2, a2, 32
+ epel_load v30, \len, \op, \name, \type, 1, t
+ vse8.v v30, (a0)
+ addi a0, a0, -32
+ addi a2, a2, -32
+.endif
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
.irp len, 64, 32, 16, 8, 4
copy_avg \len
+ .irp op, put, avg
+ .irp name, regular, sharp, smooth
+ .irp type, h, v
+ epel \len, \op, \name, \type, 128
+ epel \len, \op, \name, \type, 256
+ .endr
+ .endr
+ .endr
.endr
bilin_h_v put, h, a5
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 8fb326dae0..5fd64a1b8c 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
const uint8_t *a);
-#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \
-void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
+#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen) \
+void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my);
@@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride, \
const uint8_t *src, ptrdiff_t srcstride, \
int h, int mx, int my);
-VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
VP9_BILINEAR_RISCV_RVV_FUNC(64);
VP9_BILINEAR_RISCV_RVV_FUNC(32);
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index b3700dfb08..3669070fca 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
# endif
#if HAVE_RVV
- if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
+ if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
+ int vlenb = ff_get_rv_vlenb();
+ if (vlenb >= 16) {
#define init_fpel(idx1, sz) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv; \
@@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
#undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
+ ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
+ ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
+ ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
+
+#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \
+ init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \
+ init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \
+ init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \
+ init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \
+ init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen)
+
+ init_subpel2(0, 1, 0, h, put, 128);
+ init_subpel2(1, 1, 0, h, avg, 128);
+
+ if (flags & AV_CPU_FLAG_RVB_ADDR) {
+ init_subpel2(0, 0, 1, v, put, 128);
+ init_subpel2(1, 0, 1, v, avg, 128);
+ }
+
+ }
+ if (vlenb >= 32) {
+ init_subpel2(0, 1, 0, h, put, 256);
+ init_subpel2(1, 1, 0, h, avg, 256);
+
+ if (flags & AV_CPU_FLAG_RVB_ADDR) {
+ init_subpel2(0, 0, 1, v, put, 256);
+ init_subpel2(1, 0, 1, v, avg, 256);
+ }
+ }
}
#endif
#endif
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv
[not found] <20240615115034.3891490-1-uk7b@foxmail.com>
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
@ 2024-06-15 11:50 ` uk7b
2024-07-23 8:58 ` uk7b
2 siblings, 1 reply; 15+ messages in thread
From: uk7b @ 2024-06-15 11:50 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
vp9_avg_8tap_smooth_4hv_8bpp_c : 32.0 28.0
vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32 : 15.0 13.2
vp9_avg_8tap_smooth_8hv_8bpp_c : 98.0 86.2
vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32 : 23.7 21.2
vp9_avg_8tap_smooth_16hv_8bpp_c : 355.7 297.0
vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32 : 47.0 41.5
vp9_avg_8tap_smooth_32hv_8bpp_c : 1272.7 1099.7
vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32 : 134.7 119.7
vp9_avg_8tap_smooth_64hv_8bpp_c : 4937.0 4224.2
vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32 : 528.5 228.5
vp9_put_8tap_smooth_4hv_8bpp_c : 30.2 26.7
vp9_put_8tap_smooth_4hv_8bpp_rvv_i32 : 30.5 12.5
vp9_put_8tap_smooth_8hv_8bpp_c : 91.5 81.2
vp9_put_8tap_smooth_8hv_8bpp_rvv_i32 : 22.7 20.2
vp9_put_8tap_smooth_16hv_8bpp_c : 313.2 277.5
vp9_put_8tap_smooth_16hv_8bpp_rvv_i32 : 45.2 40.2
vp9_put_8tap_smooth_32hv_8bpp_c : 1166.7 1022.2
vp9_put_8tap_smooth_32hv_8bpp_rvv_i32 : 131.7 117.2
vp9_put_8tap_smooth_64hv_8bpp_c : 4560.5 3961.7
vp9_put_8tap_smooth_64hv_8bpp_rvv_i32 : 517.0 223.2
---
libavcodec/riscv/vp9_mc_rvv.S | 75 ++++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp_init.c | 8 ++++
2 files changed, 83 insertions(+)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 5e81301aa5..474c9035ae 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -373,6 +373,77 @@ func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
endfunc
.endm
+#if __riscv_xlen == 64
+.macro epel_hv_once len, name, op
+ sub a2, a2, a3
+ sub a2, a2, a3
+ sub a2, a2, a3
+ .irp n,0,2,4,6,8,10,12,14
+ epel_load_inc v\n, \len, put, \name, h, 1, t
+ .endr
+ addi a4, a4, -1
+1:
+ addi a4, a4, -1
+ epel_load v30, \len, \op, \name, v, 0, s
+ vse8.v v30, (a0)
+ vmv.v.v v0, v2
+ vmv.v.v v2, v4
+ vmv.v.v v4, v6
+ vmv.v.v v6, v8
+ vmv.v.v v8, v10
+ vmv.v.v v10, v12
+ vmv.v.v v12, v14
+ epel_load v14, \len, put, \name, h, 1, t
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+ epel_load v30, \len, \op, \name, v, 0, s
+ vse8.v v30, (a0)
+.endm
+
+.macro epel_hv op, name, len, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x
+ addi sp, sp, -64
+ .irp n,0,1,2,3,4,5,6,7
+ sd s\n, \n\()<<3(sp)
+ .endr
+.if \len == 64 && \vlen < 256
+ addi sp, sp, -48
+ .irp n,0,1,2,3,4,5
+ sd a\n, \n\()<<3(sp)
+ .endr
+.endif
+.ifc \op,avg
+ csrwi vxrm, 0
+.endif
+ epel_filter \name, h, t
+ epel_filter \name, v, s
+.if \vlen < 256
+ vsetvlstatic8 \len, a6, 32, m2
+.else
+ vsetvlstatic8 \len, a6, 64, m2
+.endif
+ epel_hv_once \len, \name, \op
+.if \len == 64 && \vlen < 256
+ .irp n,0,1,2,3,4,5
+ ld a\n, \n\()<<3(sp)
+ .endr
+ addi sp, sp, 48
+ addi a0, a0, 32
+ addi a2, a2, 32
+ epel_filter \name, h, t
+ epel_hv_once \len, \name, \op
+.endif
+ .irp n,0,1,2,3,4,5,6,7
+ ld s\n, \n\()<<3(sp)
+ .endr
+ addi sp, sp, 64
+
+ ret
+endfunc
+.endm
+#endif
+
.irp len, 64, 32, 16, 8, 4
copy_avg \len
.irp op, put, avg
@@ -381,6 +452,10 @@ endfunc
epel \len, \op, \name, \type, 128
epel \len, \op, \name, \type, 256
.endr
+ #if __riscv_xlen == 64
+ epel_hv \op, \name, \len, 128
+ epel_hv \op, \name, \len, 256
+ #endif
.endr
.endr
.endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 3669070fca..7b090c9889 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -119,6 +119,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
if (flags & AV_CPU_FLAG_RVB_ADDR) {
init_subpel2(0, 0, 1, v, put, 128);
init_subpel2(1, 0, 1, v, avg, 128);
+# if __riscv_xlen == 64
+ init_subpel2(0, 1, 1, hv, put, 128);
+ init_subpel2(1, 1, 1, hv, avg, 128);
+# endif
}
}
@@ -129,6 +133,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
if (flags & AV_CPU_FLAG_RVB_ADDR) {
init_subpel2(0, 0, 1, v, put, 256);
init_subpel2(1, 0, 1, v, avg, 256);
+# if __riscv_xlen == 64
+ init_subpel2(0, 1, 1, hv, put, 256);
+ init_subpel2(1, 1, 1, hv, avg, 256);
+# endif
}
}
}
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-06-15 11:52 ` flow gg
2024-06-24 20:07 ` Rémi Denis-Courmont
1 sibling, 0 replies; 15+ messages in thread
From: flow gg @ 2024-06-15 11:52 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> Copying vectors is rarely justified - mostly only before destructive
> instructions such as FMA.
It is slightly different from VP8. In VP8, many scalar values are positive,
so the related calculations can be easily replaced. However, in this
context of VP9, since t2 is a negative number, vwmaccsu is required.
Therefore, unlike the logic in VP8, we cannot use vwmulu.vx before
bilin_load to avoid vmv.
<uk7b@foxmail.com> 于2024年6月15日周六 19:51写道:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
> vp9_avg_bilin_4hv_8bpp_c : 10.7 9.5
> vp9_avg_bilin_4hv_8bpp_rvv_i32 : 4.0 3.5
> vp9_avg_bilin_8hv_8bpp_c : 38.5 34.2
> vp9_avg_bilin_8hv_8bpp_rvv_i32 : 7.2 6.5
> vp9_avg_bilin_16hv_8bpp_c : 147.2 130.5
> vp9_avg_bilin_16hv_8bpp_rvv_i32 : 14.5 12.7
> vp9_avg_bilin_32hv_8bpp_c : 574.2 509.7
> vp9_avg_bilin_32hv_8bpp_rvv_i32 : 42.5 38.0
> vp9_avg_bilin_64hv_8bpp_c : 2321.2 2017.7
> vp9_avg_bilin_64hv_8bpp_rvv_i32 : 163.5 131.0
> vp9_put_bilin_4hv_8bpp_c : 10.0 8.7
> vp9_put_bilin_4hv_8bpp_rvv_i32 : 3.5 3.0
> vp9_put_bilin_8hv_8bpp_c : 35.2 31.2
> vp9_put_bilin_8hv_8bpp_rvv_i32 : 6.5 5.7
> vp9_put_bilin_16hv_8bpp_c : 134.0 119.0
> vp9_put_bilin_16hv_8bpp_rvv_i32 : 12.7 11.5
> vp9_put_bilin_32hv_8bpp_c : 538.5 464.2
> vp9_put_bilin_32hv_8bpp_rvv_i32 : 39.7 35.2
> vp9_put_bilin_64hv_8bpp_c : 2111.7 1833.2
> vp9_put_bilin_64hv_8bpp_rvv_i32 : 138.5 122.5
> ---
> libavcodec/riscv/vp9_mc_rvv.S | 38 +++++++++++++++++++++++++++++++++-
> libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
> 2 files changed, 47 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index fb7377048a..5241562531 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
> endfunc
> .endm
>
> +.macro bilin_hv op
> +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> + vsetvlstatic8 64, t0, 64
> +.Lbilin_hv\op:
> +.ifc \op,avg
> + csrwi vxrm, 0
> +.endif
> + neg t1, a5
> + neg t2, a6
> + li t4, 8
> + bilin_load_h v24, put, a5
> + add a2, a2, a3
> +1:
> + addi a4, a4, -1
> + bilin_load_h v4, put, a5
> + vwmulu.vx v16, v4, a6
> + vwmaccsu.vx v16, t2, v24
> + vwadd.wx v16, v16, t4
> + vnsra.wi v16, v16, 4
> + vadd.vv v0, v16, v24
> +.ifc \op,avg
> + vle8.v v16, (a0)
> + vaaddu.vv v0, v0, v16
> +.endif
> + vse8.v v0, (a0)
> + vmv.v.v v24, v4
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> +
> + ret
> +endfunc
> +.endm
> +
> .irp len, 64, 32, 16, 8, 4
> copy_avg \len
> .endr
> @@ -155,6 +189,8 @@ bilin_h_v put, h, a5
> bilin_h_v avg, h, a5
> bilin_h_v put, v, a6
> bilin_h_v avg, v, a6
> +bilin_hv put
> +bilin_hv avg
>
> .macro func_bilin_h_v len, op, type
> func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> @@ -165,7 +201,7 @@ endfunc
>
> .irp len, 32, 16, 8, 4
> .irp op, put, avg
> - .irp type, h, v
> + .irp type, h, v, hv
> func_bilin_h_v \len, \op, \type
> .endr
> .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 9606d8545f..b3700dfb08 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
> dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
> dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
> dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
> + dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
> + dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
> + dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
> + dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
> + dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
> + dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
> + dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
> + dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
> + dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
> + dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
>
> #undef init_fpel
> }
> --
> 2.45.2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
@ 2024-06-15 11:52 ` flow gg
2024-07-13 9:02 ` Rémi Denis-Courmont
1 sibling, 0 replies; 15+ messages in thread
From: flow gg @ 2024-06-15 11:52 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> You can directly LLA filters + 16 * 8 * 2 and save one add. Same below.
> You can
> also use .equ to alias the filter addresses, and avoid if's.
> That's a lot of address dependencies, which is going to hurt performance.
> It
> might help to just spill more S registers if needed.
> This can be done in 3 instructions, even without mul. Of course you'll
> again
> need a spare register.
Okay, updated them
> Use a macro parameter for the stride register.
Doing this will reduce one if-else statement in this patch, but in the next
patch, it will lead to adding multiple if-else statements. I think we can
leave it unchanged.
<uk7b@foxmail.com> 于2024年6月15日周六 19:51写道:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
> vp9_avg_8tap_smooth_4h_8bpp_c : 12.7 11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 4.7 4.2
> vp9_avg_8tap_smooth_4v_8bpp_c : 29.7 12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 4.7 4.2
> vp9_avg_8tap_smooth_8h_8bpp_c : 48.7 42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.5 8.5
> vp9_avg_8tap_smooth_8v_8bpp_c : 49.7 45.5
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5
> vp9_avg_8tap_smooth_16h_8bpp_c : 192.0 166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.7 19.5
> vp9_avg_8tap_smooth_16v_8bpp_c : 191.2 175.2
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.2 19.0
> vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.2
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 68.2 60.5
> vp9_avg_8tap_smooth_32v_8bpp_c : 770.0 685.7
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.0 59.5
> vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 270.7 120.7
> vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 266.5 119.0
> vp9_put_8tap_smooth_4h_8bpp_c : 11.0 9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 3.7
> vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.5
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.0 3.7
> vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7
> vp9_put_8tap_smooth_8v_8bpp_c : 43.5 38.5
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.7 7.7
> vp9_put_8tap_smooth_16h_8bpp_c : 181.7 147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 20.0 18.0
> vp9_put_8tap_smooth_16v_8bpp_c : 168.5 149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5
> vp9_put_8tap_smooth_32h_8bpp_c : 675.0 586.5
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 65.2 58.0
> vp9_put_8tap_smooth_32v_8bpp_c : 664.7 591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.0 57.0
> vp9_put_8tap_smooth_64h_8bpp_c : 2696.2 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 259.7 115.7
> vp9_put_8tap_smooth_64v_8bpp_c : 2691.0 2348.5
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0
> ---
> libavcodec/riscv/vp9_mc_rvv.S | 200 +++++++++++++++++++++++++++++++++
> libavcodec/riscv/vp9dsp.h | 72 ++++++++----
> libavcodec/riscv/vp9dsp_init.c | 38 ++++++-
> 3 files changed, 285 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 5241562531..5e81301aa5 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
> .endif
> .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> + vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> + vsetvli zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> + vsetvli zero, zero, e16, m2, ta, ma
> +.else
> + vsetvli zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
> .macro copy_avg len
> func ff_vp9_avg\len\()_rvv, zve32x
> csrwi vxrm, 0
> @@ -181,8 +193,196 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> endfunc
> .endm
>
> +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> +
> +.macro epel_filter name, type, regtype
> + lla \regtype\()2, ff_vp9_subpel_filters_\name
> +
> +.ifc \type,v
> + slli \regtype\()0, a6, 4
> +.else
> + slli \regtype\()0, a5, 4
> +.endif
> + add \regtype\()0, \regtype\()0, \regtype\()2
> +
> + lh \regtype\()1, 2(\regtype\()0)
> + lh \regtype\()2, 4(\regtype\()0)
> + lh \regtype\()3, 6(\regtype\()0)
> + lh \regtype\()4, 8(\regtype\()0)
> + lh \regtype\()5, 10(\regtype\()0)
> + lh \regtype\()6, 12(\regtype\()0)
> +
> +.ifc \regtype,t
> + lh a7, 14(\regtype\()0)
> +.else
> + lh s7, 14(\regtype\()0)
> +.endif
> + lh \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +.ifc \from_mem, 1
> + vle8.v v22, (a2)
> +.ifc \type,v
> + add a5, a3, a2
> + sub a2, a2, a3
> + vle8.v v24, (a5)
> + vle8.v v20, (a2)
> + sh1add a2, a3, a5
> + add a5, a5, a3
> + vle8.v v26, (a5)
> + vle8.v v28, (a2)
> + add a2, a2, a3
> + vle8.v v30, (a2)
> +.else
> + addi a5, a2, 1
> + addi a2, a2, -1
> + vle8.v v24, (a5)
> + vle8.v v20, (a2)
> + addi a5, a5, 2
> + addi a2, a2, 3
> + vle8.v v28, (a5)
> + vle8.v v26, (a2)
> + addi a2, a5, 1
> + vle8.v v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> + vwmulu.vx v16, v24, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v20
> + vwmaccu.vx v16, \regtype\()5, v26
> + vwmaccsu.vx v16, \regtype\()6, v28
> +.else
> + vwmulu.vx v16, v28, \regtype\()6
> + vwmaccsu.vx v16, \regtype\()2, v20
> + vwmaccsu.vx v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v30
> +.else
> + vwmaccsu.vx v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> + sh1add a5, a3, a3
> + sub a2, a2, a5
> + sub a2, a2, a5
> + sub a5, a2, a3
> + vle8.v v28, (a2)
> + vle8.v v26, (a5)
> + sh1add a2, a3, a2
> +.else
> + addi a5, a2, -7
> + addi a2, a2, -6
> + vle8.v v26, (a5)
> + vle8.v v28, (a2)
> + addi a2, a2, 2
> +.endif
> +
> +.ifc \name,smooth
> + vwmaccsu.vx v16, \regtype\()1, v28
> +.else
> + vwmaccu.vx v16, \regtype\()1, v28
> + vwmulu.vx v28, v24, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v26
> + vwmulu.vx v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> + vwmulu.vx v16, v8, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v4
> + vwmaccu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()1, v2
> +.else
> + vwmulu.vx v16, v2, \regtype\()1
> + vwmaccu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()2, v4
> + vwmulu.vx v28, v8, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v0
> + vwmulu.vx v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v14
> +.else
> + vwmaccsu.vx v16, s7, v14
> +.endif
> +
> +.endif
> + li a5, 64
> + vwadd.wx v16, v16, a5
> + vsetvlstatic16 \len
> +
> +.ifc \name,smooth
> + vwadd.vv v24, v16, v20
> +.else
> + vwadd.vv v24, v16, v28
> + vwadd.wv v24, v24, v20
> +.endif
> + vnsra.wi v24, v24, 7
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \len, zero, 32, m2
> +
> + vnclipu.wi \dst, v24, 0
> +.ifc \op,avg
> + vle8.v v24, (a0)
> + vaaddu.vv \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> + epel_load \dst, \len, \op, \name, \type, \from_mem, \regtype
> + add a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> + epel_filter \name, \type, t
> +.if \vlen < 256
> + vsetvlstatic8 \len, a5, 32, m2
> +.else
> + vsetvlstatic8 \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> + csrwi vxrm, 0
> +.endif
> +
> +1:
> + addi a4, a4, -1
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> +.if \len == 64 && \vlen < 256
> + addi a0, a0, 32
> + addi a2, a2, 32
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> + addi a0, a0, -32
> + addi a2, a2, -32
> +.endif
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> +
> + ret
> +endfunc
> +.endm
> +
> .irp len, 64, 32, 16, 8, 4
> copy_avg \len
> + .irp op, put, avg
> + .irp name, regular, sharp, smooth
> + .irp type, h, v
> + epel \len, \op, \name, \type, 128
> + epel \len, \op, \name, \type, 256
> + .endr
> + .endr
> + .endr
> .endr
>
> bilin_h_v put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride,
> const uint8_t *l,
> void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> const uint8_t *a);
>
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
> \
> -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)
> \
> +void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> \
>
> \
> -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> \
>
> \
> -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> \
>
> \
> -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> \
>
> \
> -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> \
>
> \
> -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> const uint8_t *src, ptrdiff_t srcstride, \
> int h, int mx, int my);
>
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
>
> VP9_BILINEAR_RISCV_RVV_FUNC(64);
> VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..3669070fca 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
> # endif
>
> #if HAVE_RVV
> - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128)) {
> + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> + int vlenb = ff_get_rv_vlenb();
> + if (vlenb >= 16) {
>
> #define init_fpel(idx1, sz) \
> dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv; \
> @@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
> dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
>
> #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \
> + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \
> + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \
> + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \
> + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \
> + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \
> + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen)
> +
> + init_subpel2(0, 1, 0, h, put, 128);
> + init_subpel2(1, 1, 0, h, avg, 128);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 128);
> + init_subpel2(1, 0, 1, v, avg, 128);
> + }
> +
> + }
> + if (vlenb >= 32) {
> + init_subpel2(0, 1, 0, h, put, 256);
> + init_subpel2(1, 1, 0, h, avg, 256);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 256);
> + init_subpel2(1, 0, 1, v, avg, 256);
> + }
> + }
> }
> #endif
> #endif
> --
> 2.45.2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-06-15 11:52 ` flow gg
@ 2024-06-24 20:07 ` Rémi Denis-Courmont
2024-06-30 11:39 ` flow gg
1 sibling, 1 reply; 15+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-24 20:07 UTC (permalink / raw)
To: ffmpeg-devel
Le lauantaina 15. kesäkuuta 2024, 14.50.32 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
> vp9_avg_bilin_4hv_8bpp_c : 10.7 9.5
> vp9_avg_bilin_4hv_8bpp_rvv_i32 : 4.0 3.5
> vp9_avg_bilin_8hv_8bpp_c : 38.5 34.2
> vp9_avg_bilin_8hv_8bpp_rvv_i32 : 7.2 6.5
> vp9_avg_bilin_16hv_8bpp_c : 147.2 130.5
> vp9_avg_bilin_16hv_8bpp_rvv_i32 : 14.5 12.7
> vp9_avg_bilin_32hv_8bpp_c : 574.2 509.7
> vp9_avg_bilin_32hv_8bpp_rvv_i32 : 42.5 38.0
> vp9_avg_bilin_64hv_8bpp_c : 2321.2 2017.7
> vp9_avg_bilin_64hv_8bpp_rvv_i32 : 163.5 131.0
> vp9_put_bilin_4hv_8bpp_c : 10.0 8.7
> vp9_put_bilin_4hv_8bpp_rvv_i32 : 3.5 3.0
> vp9_put_bilin_8hv_8bpp_c : 35.2 31.2
> vp9_put_bilin_8hv_8bpp_rvv_i32 : 6.5 5.7
> vp9_put_bilin_16hv_8bpp_c : 134.0 119.0
> vp9_put_bilin_16hv_8bpp_rvv_i32 : 12.7 11.5
> vp9_put_bilin_32hv_8bpp_c : 538.5 464.2
> vp9_put_bilin_32hv_8bpp_rvv_i32 : 39.7 35.2
> vp9_put_bilin_64hv_8bpp_c : 2111.7 1833.2
> vp9_put_bilin_64hv_8bpp_rvv_i32 : 138.5 122.5
> ---
> libavcodec/riscv/vp9_mc_rvv.S | 38 +++++++++++++++++++++++++++++++++-
> libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
> 2 files changed, 47 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index fb7377048a..5241562531 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
> endfunc
> .endm
>
> +.macro bilin_hv op
> +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> + vsetvlstatic8 64, t0, 64
> +.Lbilin_hv\op:
> +.ifc \op,avg
> + csrwi vxrm, 0
> +.endif
> + neg t1, a5
> + neg t2, a6
> + li t4, 8
> + bilin_load_h v24, put, a5
> + add a2, a2, a3
> +1:
> + addi a4, a4, -1
> + bilin_load_h v4, put, a5
> + vwmulu.vx v16, v4, a6
> + vwmaccsu.vx v16, t2, v24
> + vwadd.wx v16, v16, t4
> + vnsra.wi v16, v16, 4
Why round manually?
It looks like vnclip.wi would be more straightforward here.
> + vadd.vv v0, v16, v24
> +.ifc \op,avg
> + vle8.v v16, (a0)
> + vaaddu.vv v0, v0, v16
> +.endif
> + vse8.v v0, (a0)
> + vmv.v.v v24, v4
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> +
> + ret
> +endfunc
> +.endm
> +
> .irp len, 64, 32, 16, 8, 4
> copy_avg \len
> .endr
> @@ -155,6 +189,8 @@ bilin_h_v put, h, a5
> bilin_h_v avg, h, a5
> bilin_h_v put, v, a6
> bilin_h_v avg, v, a6
> +bilin_hv put
> +bilin_hv avg
>
> .macro func_bilin_h_v len, op, type
> func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> @@ -165,7 +201,7 @@ endfunc
>
> .irp len, 32, 16, 8, 4
> .irp op, put, avg
> - .irp type, h, v
> + .irp type, h, v, hv
> func_bilin_h_v \len, \op, \type
> .endr
> .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index 9606d8545f..b3700dfb08 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][0][1][0] =
> ff_put_vp9_bilin_4h_rvv; dsp->mc[4][FILTER_BILINEAR ][1][0][1] =
> ff_avg_vp9_bilin_4v_rvv; dsp->mc[4][FILTER_BILINEAR ][1][1][0] =
> ff_avg_vp9_bilin_4h_rvv; + dsp->mc[0][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_64hv_rvv; + dsp->mc[0][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_64hv_rvv; + dsp->mc[1][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_32hv_rvv; + dsp->mc[1][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_32hv_rvv; + dsp->mc[2][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_16hv_rvv; + dsp->mc[2][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_16hv_rvv; + dsp->mc[3][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_8hv_rvv; + dsp->mc[3][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_8hv_rvv; + dsp->mc[4][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_4hv_rvv; + dsp->mc[4][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_4hv_rvv;
>
> #undef init_fpel
> }
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv
2024-06-24 20:07 ` Rémi Denis-Courmont
@ 2024-06-30 11:39 ` flow gg
0 siblings, 0 replies; 15+ messages in thread
From: flow gg @ 2024-06-30 11:39 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Initially, I tried using `vnclip.wi` with reference to h264,
- vwadd.wx v16, v16, t4
- vnsra.wi v16, v16, 4
+ vnclip.wi v16, v16, 4
but couldn't find the correct way... I think there may be some overflow
issues that I don't fully understand. How do you think it should be
replaced?
Rémi Denis-Courmont <remi@remlab.net> 于2024年6月25日周二 04:07写道:
> Le lauantaina 15. kesäkuuta 2024, 14.50.32 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> > C908 X60
> > vp9_avg_bilin_4hv_8bpp_c : 10.7 9.5
> > vp9_avg_bilin_4hv_8bpp_rvv_i32 : 4.0 3.5
> > vp9_avg_bilin_8hv_8bpp_c : 38.5 34.2
> > vp9_avg_bilin_8hv_8bpp_rvv_i32 : 7.2 6.5
> > vp9_avg_bilin_16hv_8bpp_c : 147.2 130.5
> > vp9_avg_bilin_16hv_8bpp_rvv_i32 : 14.5 12.7
> > vp9_avg_bilin_32hv_8bpp_c : 574.2 509.7
> > vp9_avg_bilin_32hv_8bpp_rvv_i32 : 42.5 38.0
> > vp9_avg_bilin_64hv_8bpp_c : 2321.2 2017.7
> > vp9_avg_bilin_64hv_8bpp_rvv_i32 : 163.5 131.0
> > vp9_put_bilin_4hv_8bpp_c : 10.0 8.7
> > vp9_put_bilin_4hv_8bpp_rvv_i32 : 3.5 3.0
> > vp9_put_bilin_8hv_8bpp_c : 35.2 31.2
> > vp9_put_bilin_8hv_8bpp_rvv_i32 : 6.5 5.7
> > vp9_put_bilin_16hv_8bpp_c : 134.0 119.0
> > vp9_put_bilin_16hv_8bpp_rvv_i32 : 12.7 11.5
> > vp9_put_bilin_32hv_8bpp_c : 538.5 464.2
> > vp9_put_bilin_32hv_8bpp_rvv_i32 : 39.7 35.2
> > vp9_put_bilin_64hv_8bpp_c : 2111.7 1833.2
> > vp9_put_bilin_64hv_8bpp_rvv_i32 : 138.5 122.5
> > ---
> > libavcodec/riscv/vp9_mc_rvv.S | 38 +++++++++++++++++++++++++++++++++-
> > libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
> > 2 files changed, 47 insertions(+), 1 deletion(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index fb7377048a..5241562531 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
> > endfunc
> > .endm
> >
> > +.macro bilin_hv op
> > +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> > + vsetvlstatic8 64, t0, 64
> > +.Lbilin_hv\op:
> > +.ifc \op,avg
> > + csrwi vxrm, 0
> > +.endif
> > + neg t1, a5
> > + neg t2, a6
> > + li t4, 8
> > + bilin_load_h v24, put, a5
> > + add a2, a2, a3
> > +1:
> > + addi a4, a4, -1
> > + bilin_load_h v4, put, a5
> > + vwmulu.vx v16, v4, a6
> > + vwmaccsu.vx v16, t2, v24
> > + vwadd.wx v16, v16, t4
> > + vnsra.wi v16, v16, 4
>
> Why round manually?
> It looks like vnclip.wi would be more straightforward here.
>
> > + vadd.vv v0, v16, v24
> > +.ifc \op,avg
> > + vle8.v v16, (a0)
> > + vaaddu.vv v0, v0, v16
> > +.endif
> > + vse8.v v0, (a0)
> > + vmv.v.v v24, v4
> > + add a2, a2, a3
> > + add a0, a0, a1
> > + bnez a4, 1b
> > +
> > + ret
> > +endfunc
> > +.endm
> > +
> > .irp len, 64, 32, 16, 8, 4
> > copy_avg \len
> > .endr
> > @@ -155,6 +189,8 @@ bilin_h_v put, h, a5
> > bilin_h_v avg, h, a5
> > bilin_h_v put, v, a6
> > bilin_h_v avg, v, a6
> > +bilin_hv put
> > +bilin_hv avg
> >
> > .macro func_bilin_h_v len, op, type
> > func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> > @@ -165,7 +201,7 @@ endfunc
> >
> > .irp len, 32, 16, 8, 4
> > .irp op, put, avg
> > - .irp type, h, v
> > + .irp type, h, v, hv
> > func_bilin_h_v \len, \op, \type
> > .endr
> > .endr
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index 9606d8545f..b3700dfb08 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][0][1][0] =
> > ff_put_vp9_bilin_4h_rvv; dsp->mc[4][FILTER_BILINEAR ][1][0][1] =
> > ff_avg_vp9_bilin_4v_rvv; dsp->mc[4][FILTER_BILINEAR ][1][1][0] =
> > ff_avg_vp9_bilin_4h_rvv; + dsp->mc[0][FILTER_BILINEAR ][0][1][1] =
> > ff_put_vp9_bilin_64hv_rvv; + dsp->mc[0][FILTER_BILINEAR ][1][1][1] =
> > ff_avg_vp9_bilin_64hv_rvv; + dsp->mc[1][FILTER_BILINEAR ][0][1][1] =
> > ff_put_vp9_bilin_32hv_rvv; + dsp->mc[1][FILTER_BILINEAR ][1][1][1] =
> > ff_avg_vp9_bilin_32hv_rvv; + dsp->mc[2][FILTER_BILINEAR ][0][1][1] =
> > ff_put_vp9_bilin_16hv_rvv; + dsp->mc[2][FILTER_BILINEAR ][1][1][1] =
> > ff_avg_vp9_bilin_16hv_rvv; + dsp->mc[3][FILTER_BILINEAR ][0][1][1] =
> > ff_put_vp9_bilin_8hv_rvv; + dsp->mc[3][FILTER_BILINEAR ][1][1][1] =
> > ff_avg_vp9_bilin_8hv_rvv; + dsp->mc[4][FILTER_BILINEAR ][0][1][1] =
> > ff_put_vp9_bilin_4hv_rvv; + dsp->mc[4][FILTER_BILINEAR ][1][1][1] =
> > ff_avg_vp9_bilin_4hv_rvv;
> >
> > #undef init_fpel
> > }
>
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
2024-06-15 11:52 ` flow gg
@ 2024-07-13 9:02 ` Rémi Denis-Courmont
2024-07-23 8:51 ` uk7b
2024-07-23 8:56 ` flow gg
1 sibling, 2 replies; 15+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-13 9:02 UTC (permalink / raw)
To: ffmpeg-devel
Le lauantaina 15. kesäkuuta 2024, 14.50.33 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
OK, so I realise that this review is very late, but...
TBH it is very hard to review this due to the large extent of conditional
code. This should be avoidable, at least partly. You can name macros for
each filter and then expand those macros instead of using if's.
Besides in my experience, it is more readable to leave the loads/stores to the
outer function or macros and factor only the calculations, whenever you need
to apply the same maths vertically and/or horizontally. This also sometimes
enables actually using shared code, e.g., the H.263 loop filter or the VC-1
ITX.
Lastly this seems to both add new optimisations *and* add specialisations for
256-bit vectors, which really should be separate patches, but maybe I just
don't understand the code. In any case, that would not really match with the
patch description.
> C908 X60
> vp9_avg_8tap_smooth_4h_8bpp_c : 12.7 11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 4.7 4.2
> vp9_avg_8tap_smooth_4v_8bpp_c : 29.7 12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 4.7 4.2
> vp9_avg_8tap_smooth_8h_8bpp_c : 48.7 42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.5 8.5
> vp9_avg_8tap_smooth_8v_8bpp_c : 49.7 45.5
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5
> vp9_avg_8tap_smooth_16h_8bpp_c : 192.0 166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.7 19.5
> vp9_avg_8tap_smooth_16v_8bpp_c : 191.2 175.2
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.2 19.0
> vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.2
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 68.2 60.5
> vp9_avg_8tap_smooth_32v_8bpp_c : 770.0 685.7
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.0 59.5
> vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 270.7 120.7
> vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 266.5 119.0
> vp9_put_8tap_smooth_4h_8bpp_c : 11.0 9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 3.7
> vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.5
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.0 3.7
> vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7
> vp9_put_8tap_smooth_8v_8bpp_c : 43.5 38.5
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.7 7.7
> vp9_put_8tap_smooth_16h_8bpp_c : 181.7 147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 20.0 18.0
> vp9_put_8tap_smooth_16v_8bpp_c : 168.5 149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5
> vp9_put_8tap_smooth_32h_8bpp_c : 675.0 586.5
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 65.2 58.0
> vp9_put_8tap_smooth_32v_8bpp_c : 664.7 591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.0 57.0
> vp9_put_8tap_smooth_64h_8bpp_c : 2696.2 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 259.7 115.7
> vp9_put_8tap_smooth_64v_8bpp_c : 2691.0 2348.5
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0
> ---
> libavcodec/riscv/vp9_mc_rvv.S | 200 +++++++++++++++++++++++++++++++++
> libavcodec/riscv/vp9dsp.h | 72 ++++++++----
> libavcodec/riscv/vp9dsp_init.c | 38 ++++++-
> 3 files changed, 285 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 5241562531..5e81301aa5 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
> .endif
> .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> + vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> + vsetvli zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> + vsetvli zero, zero, e16, m2, ta, ma
> +.else
> + vsetvli zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
> .macro copy_avg len
> func ff_vp9_avg\len\()_rvv, zve32x
> csrwi vxrm, 0
> @@ -181,8 +193,196 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> endfunc
> .endm
>
> +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> +
> +.macro epel_filter name, type, regtype
> + lla \regtype\()2, ff_vp9_subpel_filters_\name
> +
> +.ifc \type,v
> + slli \regtype\()0, a6, 4
> +.else
> + slli \regtype\()0, a5, 4
> +.endif
> + add \regtype\()0, \regtype\()0, \regtype\()2
> +
> + lh \regtype\()1, 2(\regtype\()0)
> + lh \regtype\()2, 4(\regtype\()0)
> + lh \regtype\()3, 6(\regtype\()0)
> + lh \regtype\()4, 8(\regtype\()0)
> + lh \regtype\()5, 10(\regtype\()0)
> + lh \regtype\()6, 12(\regtype\()0)
> +
> +.ifc \regtype,t
> + lh a7, 14(\regtype\()0)
> +.else
> + lh s7, 14(\regtype\()0)
> +.endif
> + lh \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +.ifc \from_mem, 1
> + vle8.v v22, (a2)
> +.ifc \type,v
> + add a5, a3, a2
> + sub a2, a2, a3
> + vle8.v v24, (a5)
> + vle8.v v20, (a2)
> + sh1add a2, a3, a5
> + add a5, a5, a3
> + vle8.v v26, (a5)
> + vle8.v v28, (a2)
> + add a2, a2, a3
> + vle8.v v30, (a2)
> +.else
> + addi a5, a2, 1
> + addi a2, a2, -1
> + vle8.v v24, (a5)
> + vle8.v v20, (a2)
> + addi a5, a5, 2
> + addi a2, a2, 3
> + vle8.v v28, (a5)
> + vle8.v v26, (a2)
> + addi a2, a5, 1
> + vle8.v v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> + vwmulu.vx v16, v24, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v20
> + vwmaccu.vx v16, \regtype\()5, v26
> + vwmaccsu.vx v16, \regtype\()6, v28
> +.else
> + vwmulu.vx v16, v28, \regtype\()6
> + vwmaccsu.vx v16, \regtype\()2, v20
> + vwmaccsu.vx v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v30
> +.else
> + vwmaccsu.vx v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> + sh1add a5, a3, a3
> + sub a2, a2, a5
> + sub a2, a2, a5
> + sub a5, a2, a3
> + vle8.v v28, (a2)
> + vle8.v v26, (a5)
> + sh1add a2, a3, a2
> +.else
> + addi a5, a2, -7
> + addi a2, a2, -6
> + vle8.v v26, (a5)
> + vle8.v v28, (a2)
> + addi a2, a2, 2
> +.endif
> +
> +.ifc \name,smooth
> + vwmaccsu.vx v16, \regtype\()1, v28
> +.else
> + vwmaccu.vx v16, \regtype\()1, v28
> + vwmulu.vx v28, v24, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v26
> + vwmulu.vx v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> + vwmulu.vx v16, v8, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v4
> + vwmaccu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()1, v2
> +.else
> + vwmulu.vx v16, v2, \regtype\()1
> + vwmaccu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()2, v4
> + vwmulu.vx v28, v8, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v0
> + vwmulu.vx v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v14
> +.else
> + vwmaccsu.vx v16, s7, v14
> +.endif
> +
> +.endif
> + li a5, 64
> + vwadd.wx v16, v16, a5
> + vsetvlstatic16 \len
> +
> +.ifc \name,smooth
> + vwadd.vv v24, v16, v20
> +.else
> + vwadd.vv v24, v16, v28
> + vwadd.wv v24, v24, v20
> +.endif
> + vnsra.wi v24, v24, 7
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \len, zero, 32, m2
> +
> + vnclipu.wi \dst, v24, 0
> +.ifc \op,avg
> + vle8.v v24, (a0)
> + vaaddu.vv \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> + epel_load \dst, \len, \op, \name, \type, \from_mem, \regtype
> + add a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> + epel_filter \name, \type, t
> +.if \vlen < 256
> + vsetvlstatic8 \len, a5, 32, m2
> +.else
> + vsetvlstatic8 \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> + csrwi vxrm, 0
> +.endif
> +
> +1:
> + addi a4, a4, -1
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> +.if \len == 64 && \vlen < 256
> + addi a0, a0, 32
> + addi a2, a2, 32
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> + addi a0, a0, -32
> + addi a2, a2, -32
> +.endif
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> +
> + ret
> +endfunc
> +.endm
> +
> .irp len, 64, 32, 16, 8, 4
> copy_avg \len
> + .irp op, put, avg
> + .irp name, regular, sharp, smooth
> + .irp type, h, v
> + epel \len, \op, \name, \type, 128
> + epel \len, \op, \name, \type, 256
> + .endr
> + .endr
> + .endr
> .endr
>
> bilin_h_v put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, const uint8_t *a);
>
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
> \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride, \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx,
> min_vlen) \ +void
> ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \ +
> ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int
> mx, int my);
>
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
>
> VP9_BILINEAR_RISCV_RVV_FUNC(64);
> VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..3669070fca 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp) # endif
>
> #if HAVE_RVV
> - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128))
> { + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> + int vlenb = ff_get_rv_vlenb();
> + if (vlenb >= 16) {
>
> #define init_fpel(idx1, sz) \
> dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv; \
> @@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_4hv_rvv;
>
> #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \
> + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \
> + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \
> + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \
> + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \
> + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \
> + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen)
> +
> + init_subpel2(0, 1, 0, h, put, 128);
> + init_subpel2(1, 1, 0, h, avg, 128);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 128);
> + init_subpel2(1, 0, 1, v, avg, 128);
> + }
> +
> + }
> + if (vlenb >= 32) {
> + init_subpel2(0, 1, 0, h, put, 256);
> + init_subpel2(1, 1, 0, h, avg, 256);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 256);
> + init_subpel2(1, 0, 1, v, avg, 256);
> + }
> + }
> }
> #endif
> #endif
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
2024-07-13 9:02 ` Rémi Denis-Courmont
@ 2024-07-23 8:51 ` uk7b
2024-07-29 15:20 ` Rémi Denis-Courmont
2024-07-23 8:56 ` flow gg
1 sibling, 1 reply; 15+ messages in thread
From: uk7b @ 2024-07-23 8:51 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
vp9_avg_8tap_smooth_4h_8bpp_c : 12.7 11.2
vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 4.7 4.2
vp9_avg_8tap_smooth_4v_8bpp_c : 29.7 12.5
vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 4.7 4.2
vp9_avg_8tap_smooth_8h_8bpp_c : 48.7 42.2
vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.5 8.5
vp9_avg_8tap_smooth_8v_8bpp_c : 49.7 45.5
vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5
vp9_avg_8tap_smooth_16h_8bpp_c : 192.0 166.5
vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.7 19.5
vp9_avg_8tap_smooth_16v_8bpp_c : 191.2 175.2
vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.2 19.0
vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.2
vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 68.2 60.5
vp9_avg_8tap_smooth_32v_8bpp_c : 770.0 685.7
vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.0 59.5
vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 270.7 120.7
vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 266.5 119.0
vp9_put_8tap_smooth_4h_8bpp_c : 11.0 9.7
vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 3.7
vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.5
vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.0 3.7
vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5
vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7
vp9_put_8tap_smooth_8v_8bpp_c : 43.5 38.5
vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.7 7.7
vp9_put_8tap_smooth_16h_8bpp_c : 181.7 147.2
vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 20.0 18.0
vp9_put_8tap_smooth_16v_8bpp_c : 168.5 149.7
vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5
vp9_put_8tap_smooth_32h_8bpp_c : 675.0 586.5
vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 65.2 58.0
vp9_put_8tap_smooth_32v_8bpp_c : 664.7 591.2
vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.0 57.0
vp9_put_8tap_smooth_64h_8bpp_c : 2696.2 2339.0
vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 259.7 115.7
vp9_put_8tap_smooth_64v_8bpp_c : 2691.0 2348.5
vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0
---
libavcodec/riscv/vp9_mc_rvv.S | 193 +++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp.h | 72 ++++++++----
libavcodec/riscv/vp9dsp_init.c | 38 ++++++-
3 files changed, 278 insertions(+), 25 deletions(-)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 5241562531..6a4be7b9bd 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -36,6 +36,18 @@
.endif
.endm
+.macro vsetvlstatic16 len
+.ifc \len,4
+ vsetvli zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+ vsetvli zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+ vsetvli zero, zero, e16, m2, ta, ma
+.else
+ vsetvli zero, zero, e16, m4, ta, ma
+.endif
+.endm
+
.macro copy_avg len
func ff_vp9_avg\len\()_rvv, zve32x
csrwi vxrm, 0
@@ -181,8 +193,189 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
endfunc
.endm
+.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
+.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
+.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
+
+.macro epel_filter name, type, regtype, arg
+ lla \regtype\()2, ff_vp9_subpel_filters_\name
+.ifc \type,v
+ slli \regtype\()0, a6, 4
+.else
+ slli \regtype\()0, a5, 4
+.endif
+ add \regtype\()0, \regtype\()0, \regtype\()2
+ lh \regtype\()1, 2(\regtype\()0)
+ lh \regtype\()2, 4(\regtype\()0)
+ lh \regtype\()3, 6(\regtype\()0)
+ lh \regtype\()4, 8(\regtype\()0)
+ lh \regtype\()5, 10(\regtype\()0)
+ lh \regtype\()6, 12(\regtype\()0)
+ lh \arg, 14(\regtype\()0)
+ lh \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst, len, op, name, type, from_mem, regtype
+.ifc \from_mem, 1
+ vle8.v v22, (a2)
+.ifc \type,v
+ add a5, a3, a2
+ sub a2, a2, a3
+ vle8.v v24, (a5)
+ vle8.v v20, (a2)
+ sh1add a2, a3, a5
+ add a5, a5, a3
+ vle8.v v26, (a5)
+ vle8.v v28, (a2)
+ add a2, a2, a3
+ vle8.v v30, (a2)
+.else
+ addi a5, a2, 1
+ addi a2, a2, -1
+ vle8.v v24, (a5)
+ vle8.v v20, (a2)
+ addi a5, a5, 2
+ addi a2, a2, 3
+ vle8.v v28, (a5)
+ vle8.v v26, (a2)
+ addi a2, a5, 1
+ vle8.v v30, (a2)
+.endif
+
+.ifc \name,smooth
+ vwmulu.vx v16, v24, \regtype\()4
+ vwmaccu.vx v16, \regtype\()2, v20
+ vwmaccu.vx v16, \regtype\()5, v26
+ vwmaccsu.vx v16, \regtype\()6, v28
+.else
+ vwmulu.vx v16, v28, \regtype\()6
+ vwmaccsu.vx v16, \regtype\()2, v20
+ vwmaccsu.vx v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+ vwmaccsu.vx v16, a7, v30
+.else
+ vwmaccsu.vx v16, s7, v30
+.endif
+
+.ifc \type,v
+ sh1add a5, a3, a3
+ sub a2, a2, a5
+ sub a2, a2, a5
+ sub a5, a2, a3
+ vle8.v v28, (a2)
+ vle8.v v26, (a5)
+ sh1add a2, a3, a2
+.else
+ addi a5, a2, -7
+ addi a2, a2, -6
+ vle8.v v26, (a5)
+ vle8.v v28, (a2)
+ addi a2, a2, 2
+.endif
+
+.ifc \name,smooth
+ vwmaccsu.vx v16, \regtype\()1, v28
+.else
+ vwmaccu.vx v16, \regtype\()1, v28
+ vwmulu.vx v28, v24, \regtype\()4
+.endif
+ vwmaccsu.vx v16, \regtype\()0, v26
+ vwmulu.vx v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+ vwmulu.vx v16, v8, \regtype\()4
+ vwmaccu.vx v16, \regtype\()2, v4
+ vwmaccu.vx v16, \regtype\()5, v10
+ vwmaccsu.vx v16, \regtype\()6, v12
+ vwmaccsu.vx v16, \regtype\()1, v2
+.else
+ vwmulu.vx v16, v2, \regtype\()1
+ vwmaccu.vx v16, \regtype\()6, v12
+ vwmaccsu.vx v16, \regtype\()5, v10
+ vwmaccsu.vx v16, \regtype\()2, v4
+ vwmulu.vx v28, v8, \regtype\()4
+.endif
+ vwmaccsu.vx v16, \regtype\()0, v0
+ vwmulu.vx v20, v6, \regtype\()3
+
+.ifc \regtype,t
+ vwmaccsu.vx v16, a7, v14
+.else
+ vwmaccsu.vx v16, s7, v14
+.endif
+
+.endif
+ li a5, 64
+ vwadd.wx v16, v16, a5
+ vsetvlstatic16 \len
+
+.ifc \name,smooth
+ vwadd.vv v24, v16, v20
+.else
+ vwadd.vv v24, v16, v28
+ vwadd.wv v24, v24, v20
+.endif
+ vnsra.wi v24, v24, 7
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \len, zero, 32, m2
+
+ vnclipu.wi \dst, v24, 0
+.ifc \op,avg
+ vle8.v v24, (a0)
+ vaaddu.vv \dst, \dst, v24
+.endif
+
+.endm
+
+.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
+ epel_load \dst, \len, \op, \name, \type, \from_mem, \regtype
+ add a2, a2, a3
+.endm
+
+.macro epel len, op, name, type, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
+ epel_filter \name, \type, t, a7
+.if \vlen < 256
+ vsetvlstatic8 \len, a5, 32, m2
+.else
+ vsetvlstatic8 \len, a5, 64, m2
+.endif
+.ifc \op,avg
+ csrwi vxrm, 0
+.endif
+
+1:
+ addi a4, a4, -1
+ epel_load v30, \len, \op, \name, \type, 1, t
+ vse8.v v30, (a0)
+.if \len == 64 && \vlen < 256
+ addi a0, a0, 32
+ addi a2, a2, 32
+ epel_load v30, \len, \op, \name, \type, 1, t
+ vse8.v v30, (a0)
+ addi a0, a0, -32
+ addi a2, a2, -32
+.endif
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
.irp len, 64, 32, 16, 8, 4
copy_avg \len
+ .irp op, put, avg
+ .irp name, regular, sharp, smooth
+ .irp type, h, v
+ epel \len, \op, \name, \type, 128
+ epel \len, \op, \name, \type, 256
+ .endr
+ .endr
+ .endr
.endr
bilin_h_v put, h, a5
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 8fb326dae0..5fd64a1b8c 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
const uint8_t *a);
-#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \
-void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
+#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen) \
+void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my);
@@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride, \
const uint8_t *src, ptrdiff_t srcstride, \
int h, int mx, int my);
-VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
VP9_BILINEAR_RISCV_RVV_FUNC(64);
VP9_BILINEAR_RISCV_RVV_FUNC(32);
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index b3700dfb08..3669070fca 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
# endif
#if HAVE_RVV
- if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
+ if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
+ int vlenb = ff_get_rv_vlenb();
+ if (vlenb >= 16) {
#define init_fpel(idx1, sz) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv; \
@@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
#undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
+ ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
+ ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
+ ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
+
+#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \
+ init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \
+ init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \
+ init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \
+ init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \
+ init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen)
+
+ init_subpel2(0, 1, 0, h, put, 128);
+ init_subpel2(1, 1, 0, h, avg, 128);
+
+ if (flags & AV_CPU_FLAG_RVB_ADDR) {
+ init_subpel2(0, 0, 1, v, put, 128);
+ init_subpel2(1, 0, 1, v, avg, 128);
+ }
+
+ }
+ if (vlenb >= 32) {
+ init_subpel2(0, 1, 0, h, put, 256);
+ init_subpel2(1, 1, 0, h, avg, 256);
+
+ if (flags & AV_CPU_FLAG_RVB_ADDR) {
+ init_subpel2(0, 0, 1, v, put, 256);
+ init_subpel2(1, 0, 1, v, avg, 256);
+ }
+ }
}
#endif
#endif
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
2024-07-13 9:02 ` Rémi Denis-Courmont
2024-07-23 8:51 ` uk7b
@ 2024-07-23 8:56 ` flow gg
1 sibling, 0 replies; 15+ messages in thread
From: flow gg @ 2024-07-23 8:56 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> TBH it is very hard to review this due to the large extents of code
> conditionals. This should be avoidable at least partly. You can name macros
for
> each filter and then expand those macros instead of using if's.
Do you mean that before the addition of .equ ff_vp9_subpel_filters_xxx,
epel_filter had too many if statements?
Now the filter has only two if statements. Anyway, I have updated it and
removed one more if statement.
> Besides in my experience, it is more readable to leave the loads/stores
to the
> outer function or macros and factor only the calculations, whenever you
need
> to apply the same maths vertically and/or horizontally. This also
sometimes
> enables actually using shared code, e.g., the H.263 loop filter or the
VC-1
> ITX.
There is an issue here because of insufficient vector registers, so vector
registers need to be reused.
If we use the H.263 method, it would require two more jumps.
Additionally, scalar registers are also insufficient, so more stack space is needed.
I want to implement this as a macro for lengths of 4, 8, 16, 32, and 64
first.
In a subsequent patch, I will break down 4, 8, and 16 into one macro, and
32 or 64 into another macro.
This way, code can be better shared and some other adjustments like vlseg
...
> Lastly this seems to both add new optimisations *and* add specialisations
for
> 256-bit vectors, which really should be separate patches, but maybe I just
> don't understand the code. In any case, that would not really match with
the
> patch description.
I think the purpose of this patch is to implement 128b+256b RVV, so adding
the corresponding 128+256 functions in vp9dsp.h could also be part of this
patch?
Rémi Denis-Courmont <remi@remlab.net> 于2024年7月13日周六 17:02写道:
> Le lauantaina 15. kesäkuuta 2024, 14.50.33 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> OK, so I realise that this review is very late, but...
>
> TBH it is very hard to review this due to the large extents of code
> conditionals. This should be avoidable at least partly. You can name macros
> for
> each filter and then expand those macros instead of using if's.
>
> Besides in my experience, it is more readable to leave the loads/stores to
> the
> outer function or macros and factor only the calculations, whenever you
> need
> to apply the same maths vertically and/or horizontally. This also
> sometimes
> enables actually using shared code, e.g., the H.263 loop filter or the
> VC-1
> ITX.
>
> Lastly this seems to both add new optimisations *and* add specialisations
> for
> 256-bit vectors, which really should be separate patches, but maybe I just
> don't understand the code. In any case, that would not really match with
> the
> patch description.
>
>
> > C908 X60
> > vp9_avg_8tap_smooth_4h_8bpp_c : 12.7 11.2
> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 4.7 4.2
> > vp9_avg_8tap_smooth_4v_8bpp_c : 29.7 12.5
> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 4.7 4.2
> > vp9_avg_8tap_smooth_8h_8bpp_c : 48.7 42.2
> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.5 8.5
> > vp9_avg_8tap_smooth_8v_8bpp_c : 49.7 45.5
> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5
> > vp9_avg_8tap_smooth_16h_8bpp_c : 192.0 166.5
> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.7 19.5
> > vp9_avg_8tap_smooth_16v_8bpp_c : 191.2 175.2
> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.2 19.0
> > vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.2
> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 68.2 60.5
> > vp9_avg_8tap_smooth_32v_8bpp_c : 770.0 685.7
> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.0 59.5
> > vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 270.7 120.7
> > vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 266.5 119.0
> > vp9_put_8tap_smooth_4h_8bpp_c : 11.0 9.7
> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 3.7
> > vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.5
> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.0 3.7
> > vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5
> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7
> > vp9_put_8tap_smooth_8v_8bpp_c : 43.5 38.5
> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.7 7.7
> > vp9_put_8tap_smooth_16h_8bpp_c : 181.7 147.2
> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 20.0 18.0
> > vp9_put_8tap_smooth_16v_8bpp_c : 168.5 149.7
> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5
> > vp9_put_8tap_smooth_32h_8bpp_c : 675.0 586.5
> > vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 65.2 58.0
> > vp9_put_8tap_smooth_32v_8bpp_c : 664.7 591.2
> > vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.0 57.0
> > vp9_put_8tap_smooth_64h_8bpp_c : 2696.2 2339.0
> > vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 259.7 115.7
> > vp9_put_8tap_smooth_64v_8bpp_c : 2691.0 2348.5
> > vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0
> > ---
> > libavcodec/riscv/vp9_mc_rvv.S | 200 +++++++++++++++++++++++++++++++++
> > libavcodec/riscv/vp9dsp.h | 72 ++++++++----
> > libavcodec/riscv/vp9dsp_init.c | 38 ++++++-
> > 3 files changed, 285 insertions(+), 25 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 5241562531..5e81301aa5 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -36,6 +36,18 @@
> > .endif
> > .endm
> >
> > +.macro vsetvlstatic16 len
> > +.ifc \len,4
> > + vsetvli zero, zero, e16, mf2, ta, ma
> > +.elseif \len == 8
> > + vsetvli zero, zero, e16, m1, ta, ma
> > +.elseif \len == 16
> > + vsetvli zero, zero, e16, m2, ta, ma
> > +.else
> > + vsetvli zero, zero, e16, m4, ta, ma
> > +.endif
> > +.endm
> > +
> > .macro copy_avg len
> > func ff_vp9_avg\len\()_rvv, zve32x
> > csrwi vxrm, 0
> > @@ -181,8 +193,196 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> > endfunc
> > .endm
> >
> > +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> > +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> > +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> > +
> > +.macro epel_filter name, type, regtype
> > + lla \regtype\()2, ff_vp9_subpel_filters_\name
> > +
> > +.ifc \type,v
> > + slli \regtype\()0, a6, 4
> > +.else
> > + slli \regtype\()0, a5, 4
> > +.endif
> > + add \regtype\()0, \regtype\()0, \regtype\()2
> > +
> > + lh \regtype\()1, 2(\regtype\()0)
> > + lh \regtype\()2, 4(\regtype\()0)
> > + lh \regtype\()3, 6(\regtype\()0)
> > + lh \regtype\()4, 8(\regtype\()0)
> > + lh \regtype\()5, 10(\regtype\()0)
> > + lh \regtype\()6, 12(\regtype\()0)
> > +
> > +.ifc \regtype,t
> > + lh a7, 14(\regtype\()0)
> > +.else
> > + lh s7, 14(\regtype\()0)
> > +.endif
> > + lh \regtype\()0, 0(\regtype\()0)
> > +.endm
> > +
> > +.macro epel_load dst, len, op, name, type, from_mem, regtype
> > +.ifc \from_mem, 1
> > + vle8.v v22, (a2)
> > +.ifc \type,v
> > + add a5, a3, a2
> > + sub a2, a2, a3
> > + vle8.v v24, (a5)
> > + vle8.v v20, (a2)
> > + sh1add a2, a3, a5
> > + add a5, a5, a3
> > + vle8.v v26, (a5)
> > + vle8.v v28, (a2)
> > + add a2, a2, a3
> > + vle8.v v30, (a2)
> > +.else
> > + addi a5, a2, 1
> > + addi a2, a2, -1
> > + vle8.v v24, (a5)
> > + vle8.v v20, (a2)
> > + addi a5, a5, 2
> > + addi a2, a2, 3
> > + vle8.v v28, (a5)
> > + vle8.v v26, (a2)
> > + addi a2, a5, 1
> > + vle8.v v30, (a2)
> > +.endif
> > +
> > +.ifc \name,smooth
> > + vwmulu.vx v16, v24, \regtype\()4
> > + vwmaccu.vx v16, \regtype\()2, v20
> > + vwmaccu.vx v16, \regtype\()5, v26
> > + vwmaccsu.vx v16, \regtype\()6, v28
> > +.else
> > + vwmulu.vx v16, v28, \regtype\()6
> > + vwmaccsu.vx v16, \regtype\()2, v20
> > + vwmaccsu.vx v16, \regtype\()5, v26
> > +.endif
> > +
> > +.ifc \regtype,t
> > + vwmaccsu.vx v16, a7, v30
> > +.else
> > + vwmaccsu.vx v16, s7, v30
> > +.endif
> > +
> > +.ifc \type,v
> > + sh1add a5, a3, a3
> > + sub a2, a2, a5
> > + sub a2, a2, a5
> > + sub a5, a2, a3
> > + vle8.v v28, (a2)
> > + vle8.v v26, (a5)
> > + sh1add a2, a3, a2
> > +.else
> > + addi a5, a2, -7
> > + addi a2, a2, -6
> > + vle8.v v26, (a5)
> > + vle8.v v28, (a2)
> > + addi a2, a2, 2
> > +.endif
> > +
> > +.ifc \name,smooth
> > + vwmaccsu.vx v16, \regtype\()1, v28
> > +.else
> > + vwmaccu.vx v16, \regtype\()1, v28
> > + vwmulu.vx v28, v24, \regtype\()4
> > +.endif
> > + vwmaccsu.vx v16, \regtype\()0, v26
> > + vwmulu.vx v20, v22, \regtype\()3
> > +.else
> > +.ifc \name,smooth
> > + vwmulu.vx v16, v8, \regtype\()4
> > + vwmaccu.vx v16, \regtype\()2, v4
> > + vwmaccu.vx v16, \regtype\()5, v10
> > + vwmaccsu.vx v16, \regtype\()6, v12
> > + vwmaccsu.vx v16, \regtype\()1, v2
> > +.else
> > + vwmulu.vx v16, v2, \regtype\()1
> > + vwmaccu.vx v16, \regtype\()6, v12
> > + vwmaccsu.vx v16, \regtype\()5, v10
> > + vwmaccsu.vx v16, \regtype\()2, v4
> > + vwmulu.vx v28, v8, \regtype\()4
> > +.endif
> > + vwmaccsu.vx v16, \regtype\()0, v0
> > + vwmulu.vx v20, v6, \regtype\()3
> > +
> > +.ifc \regtype,t
> > + vwmaccsu.vx v16, a7, v14
> > +.else
> > + vwmaccsu.vx v16, s7, v14
> > +.endif
> > +
> > +.endif
> > + li a5, 64
> > + vwadd.wx v16, v16, a5
> > + vsetvlstatic16 \len
> > +
> > +.ifc \name,smooth
> > + vwadd.vv v24, v16, v20
> > +.else
> > + vwadd.vv v24, v16, v28
> > + vwadd.wv v24, v24, v20
> > +.endif
> > + vnsra.wi v24, v24, 7
> > + vmax.vx v24, v24, zero
> > + vsetvlstatic8 \len, zero, 32, m2
> > +
> > + vnclipu.wi \dst, v24, 0
> > +.ifc \op,avg
> > + vle8.v v24, (a0)
> > + vaaddu.vv \dst, \dst, v24
> > +.endif
> > +
> > +.endm
> > +
> > +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> > + epel_load \dst, \len, \op, \name, \type, \from_mem,
> \regtype
> > + add a2, a2, a3
> > +.endm
> > +
> > +.macro epel len, op, name, type, vlen
> > +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> > + epel_filter \name, \type, t
> > +.if \vlen < 256
> > + vsetvlstatic8 \len, a5, 32, m2
> > +.else
> > + vsetvlstatic8 \len, a5, 64, m2
> > +.endif
> > +.ifc \op,avg
> > + csrwi vxrm, 0
> > +.endif
> > +
> > +1:
> > + addi a4, a4, -1
> > + epel_load v30, \len, \op, \name, \type, 1, t
> > + vse8.v v30, (a0)
> > +.if \len == 64 && \vlen < 256
> > + addi a0, a0, 32
> > + addi a2, a2, 32
> > + epel_load v30, \len, \op, \name, \type, 1, t
> > + vse8.v v30, (a0)
> > + addi a0, a0, -32
> > + addi a2, a2, -32
> > +.endif
> > + add a2, a2, a3
> > + add a0, a0, a1
> > + bnez a4, 1b
> > +
> > + ret
> > +endfunc
> > +.endm
> > +
> > .irp len, 64, 32, 16, 8, 4
> > copy_avg \len
> > + .irp op, put, avg
> > + .irp name, regular, sharp, smooth
> > + .irp type, h, v
> > + epel \len, \op, \name, \type, 128
> > + epel \len, \op, \name, \type, 256
> > + .endr
> > + .endr
> > + .endr
> > .endr
> >
> > bilin_h_v put, h, a5
> > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> > index 8fb326dae0..5fd64a1b8c 100644
> > --- a/libavcodec/riscv/vp9dsp.h
> > +++ b/libavcodec/riscv/vp9dsp.h
> > @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride,
> const
> > uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const
> > uint8_t *l, const uint8_t *a);
> >
> > -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>
> > \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> > dststride, \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx,
> > min_vlen) \ +void
> > ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \ +
> > ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my); \ \ -void
> > ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
> > +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>
> > \ + ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my); \ \ -void
> > ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
> > +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>
> > \ + ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my); \ \ -void
> > ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>
> > \ + ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my); \ \ -void
> > ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>
> > \ + ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my); \ \ -void
> > ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>
> > \ + ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my);
> > @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> > dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int
> > mx, int my);
> >
> > -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> > -
> > -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> > -
> > -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
> >
> > VP9_BILINEAR_RISCV_RVV_FUNC(64);
> > VP9_BILINEAR_RISCV_RVV_FUNC(32);
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index b3700dfb08..3669070fca 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) # endif
> >
> > #if HAVE_RVV
> > - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128))
> > { + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> > + int vlenb = ff_get_rv_vlenb();
> > + if (vlenb >= 16) {
> >
> > #define init_fpel(idx1, sz) \
> > dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] =
> ff_vp9_avg##sz##_rvv; \
> > @@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][1] =
> > ff_avg_vp9_bilin_4hv_rvv;
> >
> > #undef init_fpel
> > +
> > +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \
> > + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
> > + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \
> > + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
> > + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \
> > + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
> > + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> > +
> > +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \
> > + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \
> > + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \
> > + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \
> > + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \
> > + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen)
> > +
> > + init_subpel2(0, 1, 0, h, put, 128);
> > + init_subpel2(1, 1, 0, h, avg, 128);
> > +
> > + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> > + init_subpel2(0, 0, 1, v, put, 128);
> > + init_subpel2(1, 0, 1, v, avg, 128);
> > + }
> > +
> > + }
> > + if (vlenb >= 32) {
> > + init_subpel2(0, 1, 0, h, put, 256);
> > + init_subpel2(1, 1, 0, h, avg, 256);
> > +
> > + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> > + init_subpel2(0, 0, 1, v, put, 256);
> > + init_subpel2(1, 0, 1, v, avg, 256);
> > + }
> > + }
> > }
> > #endif
> > #endif
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv uk7b
@ 2024-07-23 8:58 ` uk7b
2024-07-23 9:03 ` flow gg
0 siblings, 1 reply; 15+ messages in thread
From: uk7b @ 2024-07-23 8:58 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
vp9_avg_8tap_smooth_4hv_8bpp_c : 32.0 28.0
vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32 : 15.0 13.2
vp9_avg_8tap_smooth_8hv_8bpp_c : 98.0 86.2
vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32 : 23.7 21.2
vp9_avg_8tap_smooth_16hv_8bpp_c : 355.7 297.0
vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32 : 47.0 41.5
vp9_avg_8tap_smooth_32hv_8bpp_c : 1272.7 1099.7
vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32 : 134.7 119.7
vp9_avg_8tap_smooth_64hv_8bpp_c : 4937.0 4224.2
vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32 : 528.5 228.5
vp9_put_8tap_smooth_4hv_8bpp_c : 30.2 26.7
vp9_put_8tap_smooth_4hv_8bpp_rvv_i32 : 30.5 12.5
vp9_put_8tap_smooth_8hv_8bpp_c : 91.5 81.2
vp9_put_8tap_smooth_8hv_8bpp_rvv_i32 : 22.7 20.2
vp9_put_8tap_smooth_16hv_8bpp_c : 313.2 277.5
vp9_put_8tap_smooth_16hv_8bpp_rvv_i32 : 45.2 40.2
vp9_put_8tap_smooth_32hv_8bpp_c : 1166.7 1022.2
vp9_put_8tap_smooth_32hv_8bpp_rvv_i32 : 131.7 117.2
vp9_put_8tap_smooth_64hv_8bpp_c : 4560.5 3961.7
vp9_put_8tap_smooth_64hv_8bpp_rvv_i32 : 517.0 223.2
---
libavcodec/riscv/vp9_mc_rvv.S | 75 ++++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp_init.c | 8 ++++
2 files changed, 83 insertions(+)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 6a4be7b9bd..26754ac6f8 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -366,6 +366,77 @@ func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
endfunc
.endm
+#if __riscv_xlen == 64
+.macro epel_hv_once len, name, op
+ sub a2, a2, a3
+ sub a2, a2, a3
+ sub a2, a2, a3
+ .irp n,0,2,4,6,8,10,12,14
+ epel_load_inc v\n, \len, put, \name, h, 1, t
+ .endr
+ addi a4, a4, -1
+1:
+ addi a4, a4, -1
+ epel_load v30, \len, \op, \name, v, 0, s
+ vse8.v v30, (a0)
+ vmv.v.v v0, v2
+ vmv.v.v v2, v4
+ vmv.v.v v4, v6
+ vmv.v.v v6, v8
+ vmv.v.v v8, v10
+ vmv.v.v v10, v12
+ vmv.v.v v12, v14
+ epel_load v14, \len, put, \name, h, 1, t
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+ epel_load v30, \len, \op, \name, v, 0, s
+ vse8.v v30, (a0)
+.endm
+
+.macro epel_hv op, name, len, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x
+ addi sp, sp, -64
+ .irp n,0,1,2,3,4,5,6,7
+ sd s\n, \n\()<<3(sp)
+ .endr
+.if \len == 64 && \vlen < 256
+ addi sp, sp, -48
+ .irp n,0,1,2,3,4,5
+ sd a\n, \n\()<<3(sp)
+ .endr
+.endif
+.ifc \op,avg
+ csrwi vxrm, 0
+.endif
+ epel_filter \name, h, t, a7
+ epel_filter \name, v, s, s7
+.if \vlen < 256
+ vsetvlstatic8 \len, a6, 32, m2
+.else
+ vsetvlstatic8 \len, a6, 64, m2
+.endif
+ epel_hv_once \len, \name, \op
+.if \len == 64 && \vlen < 256
+ .irp n,0,1,2,3,4,5
+ ld a\n, \n\()<<3(sp)
+ .endr
+ addi sp, sp, 48
+ addi a0, a0, 32
+ addi a2, a2, 32
+ epel_filter \name, h, t, a7
+ epel_hv_once \len, \name, \op
+.endif
+ .irp n,0,1,2,3,4,5,6,7
+ ld s\n, \n\()<<3(sp)
+ .endr
+ addi sp, sp, 64
+
+ ret
+endfunc
+.endm
+#endif
+
.irp len, 64, 32, 16, 8, 4
copy_avg \len
.irp op, put, avg
@@ -374,6 +445,10 @@ endfunc
epel \len, \op, \name, \type, 128
epel \len, \op, \name, \type, 256
.endr
+ #if __riscv_xlen == 64
+ epel_hv \op, \name, \len, 128
+ epel_hv \op, \name, \len, 256
+ #endif
.endr
.endr
.endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 3669070fca..7b090c9889 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -119,6 +119,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
if (flags & AV_CPU_FLAG_RVB_ADDR) {
init_subpel2(0, 0, 1, v, put, 128);
init_subpel2(1, 0, 1, v, avg, 128);
+# if __riscv_xlen == 64
+ init_subpel2(0, 1, 1, hv, put, 128);
+ init_subpel2(1, 1, 1, hv, avg, 128);
+# endif
}
}
@@ -129,6 +133,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
if (flags & AV_CPU_FLAG_RVB_ADDR) {
init_subpel2(0, 0, 1, v, put, 256);
init_subpel2(1, 0, 1, v, avg, 256);
+# if __riscv_xlen == 64
+ init_subpel2(0, 1, 1, hv, put, 256);
+ init_subpel2(1, 1, 1, hv, avg, 256);
+# endif
}
}
}
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv
2024-07-23 8:58 ` uk7b
@ 2024-07-23 9:03 ` flow gg
0 siblings, 0 replies; 15+ messages in thread
From: flow gg @ 2024-07-23 9:03 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Because of the update to patch 3/4, I have updated this patch as well.
<uk7b@foxmail.com> 于2024年7月23日周二 16:59写道:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
> vp9_avg_8tap_smooth_4hv_8bpp_c : 32.0 28.0
> vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32 : 15.0 13.2
> vp9_avg_8tap_smooth_8hv_8bpp_c : 98.0 86.2
> vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32 : 23.7 21.2
> vp9_avg_8tap_smooth_16hv_8bpp_c : 355.7 297.0
> vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32 : 47.0 41.5
> vp9_avg_8tap_smooth_32hv_8bpp_c : 1272.7 1099.7
> vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32 : 134.7 119.7
> vp9_avg_8tap_smooth_64hv_8bpp_c : 4937.0 4224.2
> vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32 : 528.5 228.5
> vp9_put_8tap_smooth_4hv_8bpp_c : 30.2 26.7
> vp9_put_8tap_smooth_4hv_8bpp_rvv_i32 : 30.5 12.5
> vp9_put_8tap_smooth_8hv_8bpp_c : 91.5 81.2
> vp9_put_8tap_smooth_8hv_8bpp_rvv_i32 : 22.7 20.2
> vp9_put_8tap_smooth_16hv_8bpp_c : 313.2 277.5
> vp9_put_8tap_smooth_16hv_8bpp_rvv_i32 : 45.2 40.2
> vp9_put_8tap_smooth_32hv_8bpp_c : 1166.7 1022.2
> vp9_put_8tap_smooth_32hv_8bpp_rvv_i32 : 131.7 117.2
> vp9_put_8tap_smooth_64hv_8bpp_c : 4560.5 3961.7
> vp9_put_8tap_smooth_64hv_8bpp_rvv_i32 : 517.0 223.2
> ---
> libavcodec/riscv/vp9_mc_rvv.S | 75 ++++++++++++++++++++++++++++++++++
> libavcodec/riscv/vp9dsp_init.c | 8 ++++
> 2 files changed, 83 insertions(+)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 6a4be7b9bd..26754ac6f8 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -366,6 +366,77 @@ func
> ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> endfunc
> .endm
>
> +#if __riscv_xlen == 64
> +.macro epel_hv_once len, name, op
> + sub a2, a2, a3
> + sub a2, a2, a3
> + sub a2, a2, a3
> + .irp n,0,2,4,6,8,10,12,14
> + epel_load_inc v\n, \len, put, \name, h, 1, t
> + .endr
> + addi a4, a4, -1
> +1:
> + addi a4, a4, -1
> + epel_load v30, \len, \op, \name, v, 0, s
> + vse8.v v30, (a0)
> + vmv.v.v v0, v2
> + vmv.v.v v2, v4
> + vmv.v.v v4, v6
> + vmv.v.v v6, v8
> + vmv.v.v v8, v10
> + vmv.v.v v10, v12
> + vmv.v.v v12, v14
> + epel_load v14, \len, put, \name, h, 1, t
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> + epel_load v30, \len, \op, \name, v, 0, s
> + vse8.v v30, (a0)
> +.endm
> +
> +.macro epel_hv op, name, len, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x
> + addi sp, sp, -64
> + .irp n,0,1,2,3,4,5,6,7
> + sd s\n, \n\()<<3(sp)
> + .endr
> +.if \len == 64 && \vlen < 256
> + addi sp, sp, -48
> + .irp n,0,1,2,3,4,5
> + sd a\n, \n\()<<3(sp)
> + .endr
> +.endif
> +.ifc \op,avg
> + csrwi vxrm, 0
> +.endif
> + epel_filter \name, h, t, a7
> + epel_filter \name, v, s, s7
> +.if \vlen < 256
> + vsetvlstatic8 \len, a6, 32, m2
> +.else
> + vsetvlstatic8 \len, a6, 64, m2
> +.endif
> + epel_hv_once \len, \name, \op
> +.if \len == 64 && \vlen < 256
> + .irp n,0,1,2,3,4,5
> + ld a\n, \n\()<<3(sp)
> + .endr
> + addi sp, sp, 48
> + addi a0, a0, 32
> + addi a2, a2, 32
> + epel_filter \name, h, t, a7
> + epel_hv_once \len, \name, \op
> +.endif
> + .irp n,0,1,2,3,4,5,6,7
> + ld s\n, \n\()<<3(sp)
> + .endr
> + addi sp, sp, 64
> +
> + ret
> +endfunc
> +.endm
> +#endif
> +
> .irp len, 64, 32, 16, 8, 4
> copy_avg \len
> .irp op, put, avg
> @@ -374,6 +445,10 @@ endfunc
> epel \len, \op, \name, \type, 128
> epel \len, \op, \name, \type, 256
> .endr
> + #if __riscv_xlen == 64
> + epel_hv \op, \name, \len, 128
> + epel_hv \op, \name, \len, 256
> + #endif
> .endr
> .endr
> .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 3669070fca..7b090c9889 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -119,6 +119,10 @@ static av_cold void
> vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
> if (flags & AV_CPU_FLAG_RVB_ADDR) {
> init_subpel2(0, 0, 1, v, put, 128);
> init_subpel2(1, 0, 1, v, avg, 128);
> +# if __riscv_xlen == 64
> + init_subpel2(0, 1, 1, hv, put, 128);
> + init_subpel2(1, 1, 1, hv, avg, 128);
> +# endif
> }
>
> }
> @@ -129,6 +133,10 @@ static av_cold void
> vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
> if (flags & AV_CPU_FLAG_RVB_ADDR) {
> init_subpel2(0, 0, 1, v, put, 256);
> init_subpel2(1, 0, 1, v, avg, 256);
> +# if __riscv_xlen == 64
> + init_subpel2(0, 1, 1, hv, put, 256);
> + init_subpel2(1, 1, 1, hv, avg, 256);
> +# endif
> }
> }
> }
> --
> 2.45.2
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
2024-07-23 8:51 ` uk7b
@ 2024-07-29 15:20 ` Rémi Denis-Courmont
2024-07-31 10:36 ` flow gg
0 siblings, 1 reply; 15+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-29 15:20 UTC (permalink / raw)
To: ffmpeg-devel
Le tiistaina 23. heinäkuuta 2024, 11.51.48 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
> vp9_avg_8tap_smooth_4h_8bpp_c : 12.7 11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 4.7 4.2
> vp9_avg_8tap_smooth_4v_8bpp_c : 29.7 12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 4.7 4.2
> vp9_avg_8tap_smooth_8h_8bpp_c : 48.7 42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.5 8.5
> vp9_avg_8tap_smooth_8v_8bpp_c : 49.7 45.5
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5
> vp9_avg_8tap_smooth_16h_8bpp_c : 192.0 166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.7 19.5
> vp9_avg_8tap_smooth_16v_8bpp_c : 191.2 175.2
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.2 19.0
> vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.2
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 68.2 60.5
> vp9_avg_8tap_smooth_32v_8bpp_c : 770.0 685.7
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.0 59.5
> vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 270.7 120.7
> vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 266.5 119.0
> vp9_put_8tap_smooth_4h_8bpp_c : 11.0 9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 3.7
> vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.5
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.0 3.7
> vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7
> vp9_put_8tap_smooth_8v_8bpp_c : 43.5 38.5
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.7 7.7
> vp9_put_8tap_smooth_16h_8bpp_c : 181.7 147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 20.0 18.0
> vp9_put_8tap_smooth_16v_8bpp_c : 168.5 149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5
> vp9_put_8tap_smooth_32h_8bpp_c : 675.0 586.5
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 65.2 58.0
> vp9_put_8tap_smooth_32v_8bpp_c : 664.7 591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.0 57.0
> vp9_put_8tap_smooth_64h_8bpp_c : 2696.2 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 259.7 115.7
> vp9_put_8tap_smooth_64v_8bpp_c : 2691.0 2348.5
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0
> ---
> libavcodec/riscv/vp9_mc_rvv.S | 193 +++++++++++++++++++++++++++++++++
> libavcodec/riscv/vp9dsp.h | 72 ++++++++----
> libavcodec/riscv/vp9dsp_init.c | 38 ++++++-
> 3 files changed, 278 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 5241562531..6a4be7b9bd 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
> .endif
> .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> + vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> + vsetvli zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> + vsetvli zero, zero, e16, m2, ta, ma
> +.else
> + vsetvli zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
> .macro copy_avg len
> func ff_vp9_avg\len\()_rvv, zve32x
> csrwi vxrm, 0
> @@ -181,8 +193,189 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> endfunc
> .endm
>
> +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> +
> +.macro epel_filter name, type, regtype, arg
> + lla \regtype\()2, ff_vp9_subpel_filters_\name
> +.ifc \type,v
> + slli \regtype\()0, a6, 4
> +.else
> + slli \regtype\()0, a5, 4
> +.endif
> + add \regtype\()0, \regtype\()0, \regtype\()2
> + lh \regtype\()1, 2(\regtype\()0)
> + lh \regtype\()2, 4(\regtype\()0)
> + lh \regtype\()3, 6(\regtype\()0)
> + lh \regtype\()4, 8(\regtype\()0)
> + lh \regtype\()5, 10(\regtype\()0)
> + lh \regtype\()6, 12(\regtype\()0)
> + lh \arg, 14(\regtype\()0)
> + lh \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +.ifc \from_mem, 1
> + vle8.v v22, (a2)
> +.ifc \type,v
> + add a5, a3, a2
> + sub a2, a2, a3
> + vle8.v v24, (a5)
> + vle8.v v20, (a2)
> + sh1add a2, a3, a5
> + add a5, a5, a3
> + vle8.v v26, (a5)
> + vle8.v v28, (a2)
> + add a2, a2, a3
> + vle8.v v30, (a2)
> +.else
> + addi a5, a2, 1
> + addi a2, a2, -1
> + vle8.v v24, (a5)
> + vle8.v v20, (a2)
> + addi a5, a5, 2
> + addi a2, a2, 3
> + vle8.v v28, (a5)
> + vle8.v v26, (a2)
> + addi a2, a5, 1
> + vle8.v v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> + vwmulu.vx v16, v24, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v20
> + vwmaccu.vx v16, \regtype\()5, v26
> + vwmaccsu.vx v16, \regtype\()6, v28
> +.else
> + vwmulu.vx v16, v28, \regtype\()6
> + vwmaccsu.vx v16, \regtype\()2, v20
> + vwmaccsu.vx v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v30
> +.else
> + vwmaccsu.vx v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> + sh1add a5, a3, a3
> + sub a2, a2, a5
> + sub a2, a2, a5
> + sub a5, a2, a3
> + vle8.v v28, (a2)
> + vle8.v v26, (a5)
> + sh1add a2, a3, a2
> +.else
> + addi a5, a2, -7
> + addi a2, a2, -6
> + vle8.v v26, (a5)
> + vle8.v v28, (a2)
> + addi a2, a2, 2
> +.endif
> +
> +.ifc \name,smooth
> + vwmaccsu.vx v16, \regtype\()1, v28
> +.else
> + vwmaccu.vx v16, \regtype\()1, v28
> + vwmulu.vx v28, v24, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v26
> + vwmulu.vx v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> + vwmulu.vx v16, v8, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v4
> + vwmaccu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()1, v2
> +.else
> + vwmulu.vx v16, v2, \regtype\()1
> + vwmaccu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()2, v4
> + vwmulu.vx v28, v8, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v0
> + vwmulu.vx v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v14
> +.else
> + vwmaccsu.vx v16, s7, v14
> +.endif
> +
> +.endif
> + li a5, 64
> + vwadd.wx v16, v16, a5
Use rounding.
> + vsetvlstatic16 \len
> +
> +.ifc \name,smooth
> + vwadd.vv v24, v16, v20
> +.else
> + vwadd.vv v24, v16, v28
> + vwadd.wv v24, v24, v20
> +.endif
> + vnsra.wi v24, v24, 7
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \len, zero, 32, m2
> +
> + vnclipu.wi \dst, v24, 0
> +.ifc \op,avg
> + vle8.v v24, (a0)
> + vaaddu.vv \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> + epel_load \dst, \len, \op, \name, \type, \from_mem, \regtype
> + add a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> + epel_filter \name, \type, t, a7
> +.if \vlen < 256
> + vsetvlstatic8 \len, a5, 32, m2
> +.else
> + vsetvlstatic8 \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> + csrwi vxrm, 0
> +.endif
> +
> +1:
> + addi a4, a4, -1
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> +.if \len == 64 && \vlen < 256
> + addi a0, a0, 32
> + addi a2, a2, 32
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> + addi a0, a0, -32
> + addi a2, a2, -32
> +.endif
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> +
> + ret
> +endfunc
> +.endm
> +
> .irp len, 64, 32, 16, 8, 4
> copy_avg \len
> + .irp op, put, avg
> + .irp name, regular, sharp, smooth
> + .irp type, h, v
> + epel \len, \op, \name, \type, 128
> + epel \len, \op, \name, \type, 256
> + .endr
> + .endr
> + .endr
> .endr
>
> bilin_h_v put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, const uint8_t *a);
>
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
> \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride, \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx,
> min_vlen) \ +void
> ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \ +
> ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int
> mx, int my);
>
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
>
> VP9_BILINEAR_RISCV_RVV_FUNC(64);
> VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..3669070fca 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp) # endif
>
> #if HAVE_RVV
> - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128))
> { + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> + int vlenb = ff_get_rv_vlenb();
> + if (vlenb >= 16) {
>
> #define init_fpel(idx1, sz) \
> dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv; \
> @@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_4hv_rvv;
>
> #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \
> + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \
> + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \
> + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \
> + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \
> + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \
> + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen)
> +
> + init_subpel2(0, 1, 0, h, put, 128);
> + init_subpel2(1, 1, 0, h, avg, 128);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 128);
> + init_subpel2(1, 0, 1, v, avg, 128);
> + }
> +
> + }
> + if (vlenb >= 32) {
> + init_subpel2(0, 1, 0, h, put, 256);
> + init_subpel2(1, 1, 0, h, avg, 256);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 256);
> + init_subpel2(1, 0, 1, v, avg, 256);
> + }
> + }
> }
> #endif
> #endif
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
2024-07-29 15:20 ` Rémi Denis-Courmont
@ 2024-07-31 10:36 ` flow gg
2024-07-31 19:10 ` Rémi Denis-Courmont
0 siblings, 1 reply; 15+ messages in thread
From: flow gg @ 2024-07-31 10:36 UTC (permalink / raw)
To: FFmpeg development discussions and patches
I'm a bit confused because the calculation here goes up to 32 bits and then
returns to 8 bits. It seems that the vmax and vnclipu instructions can't be
removed by using round-related instructions?
Rémi Denis-Courmont <remi@remlab.net> 于2024年7月29日周一 23:21写道:
> Le tiistaina 23. heinäkuuta 2024, 11.51.48 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> > C908 X60
> > vp9_avg_8tap_smooth_4h_8bpp_c : 12.7 11.2
> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 4.7 4.2
> > vp9_avg_8tap_smooth_4v_8bpp_c : 29.7 12.5
> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 4.7 4.2
> > vp9_avg_8tap_smooth_8h_8bpp_c : 48.7 42.2
> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.5 8.5
> > vp9_avg_8tap_smooth_8v_8bpp_c : 49.7 45.5
> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5
> > vp9_avg_8tap_smooth_16h_8bpp_c : 192.0 166.5
> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.7 19.5
> > vp9_avg_8tap_smooth_16v_8bpp_c : 191.2 175.2
> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.2 19.0
> > vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.2
> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 68.2 60.5
> > vp9_avg_8tap_smooth_32v_8bpp_c : 770.0 685.7
> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.0 59.5
> > vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 270.7 120.7
> > vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 266.5 119.0
> > vp9_put_8tap_smooth_4h_8bpp_c : 11.0 9.7
> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 3.7
> > vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.5
> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.0 3.7
> > vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5
> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7
> > vp9_put_8tap_smooth_8v_8bpp_c : 43.5 38.5
> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.7 7.7
> > vp9_put_8tap_smooth_16h_8bpp_c : 181.7 147.2
> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 20.0 18.0
> > vp9_put_8tap_smooth_16v_8bpp_c : 168.5 149.7
> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5
> > vp9_put_8tap_smooth_32h_8bpp_c : 675.0 586.5
> > vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 65.2 58.0
> > vp9_put_8tap_smooth_32v_8bpp_c : 664.7 591.2
> > vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.0 57.0
> > vp9_put_8tap_smooth_64h_8bpp_c : 2696.2 2339.0
> > vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 259.7 115.7
> > vp9_put_8tap_smooth_64v_8bpp_c : 2691.0 2348.5
> > vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0
> > ---
> > libavcodec/riscv/vp9_mc_rvv.S | 193 +++++++++++++++++++++++++++++++++
> > libavcodec/riscv/vp9dsp.h | 72 ++++++++----
> > libavcodec/riscv/vp9dsp_init.c | 38 ++++++-
> > 3 files changed, 278 insertions(+), 25 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 5241562531..6a4be7b9bd 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -36,6 +36,18 @@
> > .endif
> > .endm
> >
> > +.macro vsetvlstatic16 len
> > +.ifc \len,4
> > + vsetvli zero, zero, e16, mf2, ta, ma
> > +.elseif \len == 8
> > + vsetvli zero, zero, e16, m1, ta, ma
> > +.elseif \len == 16
> > + vsetvli zero, zero, e16, m2, ta, ma
> > +.else
> > + vsetvli zero, zero, e16, m4, ta, ma
> > +.endif
> > +.endm
> > +
> > .macro copy_avg len
> > func ff_vp9_avg\len\()_rvv, zve32x
> > csrwi vxrm, 0
> > @@ -181,8 +193,189 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> > endfunc
> > .endm
> >
> > +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> > +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> > +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> > +
> > +.macro epel_filter name, type, regtype, arg
> > + lla \regtype\()2, ff_vp9_subpel_filters_\name
> > +.ifc \type,v
> > + slli \regtype\()0, a6, 4
> > +.else
> > + slli \regtype\()0, a5, 4
> > +.endif
> > + add \regtype\()0, \regtype\()0, \regtype\()2
> > + lh \regtype\()1, 2(\regtype\()0)
> > + lh \regtype\()2, 4(\regtype\()0)
> > + lh \regtype\()3, 6(\regtype\()0)
> > + lh \regtype\()4, 8(\regtype\()0)
> > + lh \regtype\()5, 10(\regtype\()0)
> > + lh \regtype\()6, 12(\regtype\()0)
> > + lh \arg, 14(\regtype\()0)
> > + lh \regtype\()0, 0(\regtype\()0)
> > +.endm
> > +
> > +.macro epel_load dst, len, op, name, type, from_mem, regtype
> > +.ifc \from_mem, 1
> > + vle8.v v22, (a2)
> > +.ifc \type,v
> > + add a5, a3, a2
> > + sub a2, a2, a3
> > + vle8.v v24, (a5)
> > + vle8.v v20, (a2)
> > + sh1add a2, a3, a5
> > + add a5, a5, a3
> > + vle8.v v26, (a5)
> > + vle8.v v28, (a2)
> > + add a2, a2, a3
> > + vle8.v v30, (a2)
> > +.else
> > + addi a5, a2, 1
> > + addi a2, a2, -1
> > + vle8.v v24, (a5)
> > + vle8.v v20, (a2)
> > + addi a5, a5, 2
> > + addi a2, a2, 3
> > + vle8.v v28, (a5)
> > + vle8.v v26, (a2)
> > + addi a2, a5, 1
> > + vle8.v v30, (a2)
> > +.endif
> > +
> > +.ifc \name,smooth
> > + vwmulu.vx v16, v24, \regtype\()4
> > + vwmaccu.vx v16, \regtype\()2, v20
> > + vwmaccu.vx v16, \regtype\()5, v26
> > + vwmaccsu.vx v16, \regtype\()6, v28
> > +.else
> > + vwmulu.vx v16, v28, \regtype\()6
> > + vwmaccsu.vx v16, \regtype\()2, v20
> > + vwmaccsu.vx v16, \regtype\()5, v26
> > +.endif
> > +
> > +.ifc \regtype,t
> > + vwmaccsu.vx v16, a7, v30
> > +.else
> > + vwmaccsu.vx v16, s7, v30
> > +.endif
> > +
> > +.ifc \type,v
> > + sh1add a5, a3, a3
> > + sub a2, a2, a5
> > + sub a2, a2, a5
> > + sub a5, a2, a3
> > + vle8.v v28, (a2)
> > + vle8.v v26, (a5)
> > + sh1add a2, a3, a2
> > +.else
> > + addi a5, a2, -7
> > + addi a2, a2, -6
> > + vle8.v v26, (a5)
> > + vle8.v v28, (a2)
> > + addi a2, a2, 2
> > +.endif
> > +
> > +.ifc \name,smooth
> > + vwmaccsu.vx v16, \regtype\()1, v28
> > +.else
> > + vwmaccu.vx v16, \regtype\()1, v28
> > + vwmulu.vx v28, v24, \regtype\()4
> > +.endif
> > + vwmaccsu.vx v16, \regtype\()0, v26
> > + vwmulu.vx v20, v22, \regtype\()3
> > +.else
> > +.ifc \name,smooth
> > + vwmulu.vx v16, v8, \regtype\()4
> > + vwmaccu.vx v16, \regtype\()2, v4
> > + vwmaccu.vx v16, \regtype\()5, v10
> > + vwmaccsu.vx v16, \regtype\()6, v12
> > + vwmaccsu.vx v16, \regtype\()1, v2
> > +.else
> > + vwmulu.vx v16, v2, \regtype\()1
> > + vwmaccu.vx v16, \regtype\()6, v12
> > + vwmaccsu.vx v16, \regtype\()5, v10
> > + vwmaccsu.vx v16, \regtype\()2, v4
> > + vwmulu.vx v28, v8, \regtype\()4
> > +.endif
> > + vwmaccsu.vx v16, \regtype\()0, v0
> > + vwmulu.vx v20, v6, \regtype\()3
> > +
> > +.ifc \regtype,t
> > + vwmaccsu.vx v16, a7, v14
> > +.else
> > + vwmaccsu.vx v16, s7, v14
> > +.endif
> > +
> > +.endif
> > + li a5, 64
> > + vwadd.wx v16, v16, a5
>
> Use rounding.
>
> > + vsetvlstatic16 \len
> > +
> > +.ifc \name,smooth
> > + vwadd.vv v24, v16, v20
> > +.else
> > + vwadd.vv v24, v16, v28
> > + vwadd.wv v24, v24, v20
> > +.endif
> > + vnsra.wi v24, v24, 7
> > + vmax.vx v24, v24, zero
> > + vsetvlstatic8 \len, zero, 32, m2
> > +
> > + vnclipu.wi \dst, v24, 0
> > +.ifc \op,avg
> > + vle8.v v24, (a0)
> > + vaaddu.vv \dst, \dst, v24
> > +.endif
> > +
> > +.endm
> > +
> > +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> > + epel_load \dst, \len, \op, \name, \type, \from_mem,
> \regtype
> > + add a2, a2, a3
> > +.endm
> > +
> > +.macro epel len, op, name, type, vlen
> > +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> > + epel_filter \name, \type, t, a7
> > +.if \vlen < 256
> > + vsetvlstatic8 \len, a5, 32, m2
> > +.else
> > + vsetvlstatic8 \len, a5, 64, m2
> > +.endif
> > +.ifc \op,avg
> > + csrwi vxrm, 0
> > +.endif
> > +
> > +1:
> > + addi a4, a4, -1
> > + epel_load v30, \len, \op, \name, \type, 1, t
> > + vse8.v v30, (a0)
> > +.if \len == 64 && \vlen < 256
> > + addi a0, a0, 32
> > + addi a2, a2, 32
> > + epel_load v30, \len, \op, \name, \type, 1, t
> > + vse8.v v30, (a0)
> > + addi a0, a0, -32
> > + addi a2, a2, -32
> > +.endif
> > + add a2, a2, a3
> > + add a0, a0, a1
> > + bnez a4, 1b
> > +
> > + ret
> > +endfunc
> > +.endm
> > +
> > .irp len, 64, 32, 16, 8, 4
> > copy_avg \len
> > + .irp op, put, avg
> > + .irp name, regular, sharp, smooth
> > + .irp type, h, v
> > + epel \len, \op, \name, \type, 128
> > + epel \len, \op, \name, \type, 256
> > + .endr
> > + .endr
> > + .endr
> > .endr
> >
> > bilin_h_v put, h, a5
> > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> > index 8fb326dae0..5fd64a1b8c 100644
> > --- a/libavcodec/riscv/vp9dsp.h
> > +++ b/libavcodec/riscv/vp9dsp.h
> > @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride,
> const
> > uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const
> > uint8_t *l, const uint8_t *a);
> >
> > -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>
> > \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> > dststride, \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx,
> > min_vlen) \ +void
> > ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \ +
> > ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my); \ \ -void
> > ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
> > +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>
> > \ + ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my); \ \ -void
> > ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
> > +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>
> > \ + ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my); \ \ -void
> > ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>
> > \ + ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my); \ \ -void
> > ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>
> > \ + ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my); \ \ -void
> > ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>
> > \ + ptrdiff_t dststride,
>
> > \ const uint8_t *src, \ ptrdiff_t srcstride,
>
> > \ int h, int mx, int my);
> > @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> > dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int
> > mx, int my);
> >
> > -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> > -
> > -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> > -
> > -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
> >
> > VP9_BILINEAR_RISCV_RVV_FUNC(64);
> > VP9_BILINEAR_RISCV_RVV_FUNC(32);
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index b3700dfb08..3669070fca 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) # endif
> >
> > #if HAVE_RVV
> > - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128))
> > { + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> > + int vlenb = ff_get_rv_vlenb();
> > + if (vlenb >= 16) {
> >
> > #define init_fpel(idx1, sz) \
> > dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] =
> ff_vp9_avg##sz##_rvv; \
> > @@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][1] =
> > ff_avg_vp9_bilin_4hv_rvv;
> >
> > #undef init_fpel
> > +
> > +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \
> > + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
> > + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \
> > + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
> > + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \
> > + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
> > + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> > +
> > +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \
> > + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \
> > + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \
> > + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \
> > + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \
> > + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen)
> > +
> > + init_subpel2(0, 1, 0, h, put, 128);
> > + init_subpel2(1, 1, 0, h, avg, 128);
> > +
> > + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> > + init_subpel2(0, 0, 1, v, put, 128);
> > + init_subpel2(1, 0, 1, v, avg, 128);
> > + }
> > +
> > + }
> > + if (vlenb >= 32) {
> > + init_subpel2(0, 1, 0, h, put, 256);
> > + init_subpel2(1, 1, 0, h, avg, 256);
> > +
> > + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> > + init_subpel2(0, 0, 1, v, put, 256);
> > + init_subpel2(1, 0, 1, v, avg, 256);
> > + }
> > + }
> > }
> > #endif
> > #endif
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
2024-07-31 10:36 ` flow gg
@ 2024-07-31 19:10 ` Rémi Denis-Courmont
0 siblings, 0 replies; 15+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-31 19:10 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Le keskiviikkona 31. heinäkuuta 2024, 13.36.00 EEST flow gg a écrit :
> I'm a bit confused because the calculation here goes up to 32 bits and then
> returns to 8 bits. It seems that the vmax and vnclipu instructions can't be
> removed by using round-related instructions?
You seem to be adding 64 then dividing by 128. That's rounding to nearest (round-half-up), so the separate rounding instructions should be unnecessary.
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2024-07-31 19:10 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <20240615115034.3891490-1-uk7b@foxmail.com>
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-06-15 11:52 ` flow gg
2024-06-24 20:07 ` Rémi Denis-Courmont
2024-06-30 11:39 ` flow gg
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
2024-06-15 11:52 ` flow gg
2024-07-13 9:02 ` Rémi Denis-Courmont
2024-07-23 8:51 ` uk7b
2024-07-29 15:20 ` Rémi Denis-Courmont
2024-07-31 10:36 ` flow gg
2024-07-31 19:10 ` Rémi Denis-Courmont
2024-07-23 8:56 ` flow gg
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv uk7b
2024-07-23 8:58 ` uk7b
2024-07-23 9:03 ` flow gg
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git