* [FFmpeg-devel] [PATCH v2 2/5] lavc/vp9dsp: R-V V mc bilin h v [not found] <20240521171319.2629938-1-uk7b@foxmail.com> @ 2024-05-21 17:13 ` uk7b 2024-05-21 18:29 ` Rémi Denis-Courmont 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap " uk7b ` (2 subsequent siblings) 3 siblings, 1 reply; 11+ messages in thread From: uk7b @ 2024-05-21 17:13 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_avg_bilin_4h_8bpp_c: 5.2 vp9_avg_bilin_4h_8bpp_rvv_i64: 2.2 vp9_avg_bilin_4v_8bpp_c: 5.5 vp9_avg_bilin_4v_8bpp_rvv_i64: 2.2 vp9_avg_bilin_8h_8bpp_c: 20.0 vp9_avg_bilin_8h_8bpp_rvv_i64: 4.5 vp9_avg_bilin_8v_8bpp_c: 21.0 vp9_avg_bilin_8v_8bpp_rvv_i64: 4.2 vp9_avg_bilin_16h_8bpp_c: 78.2 vp9_avg_bilin_16h_8bpp_rvv_i64: 9.0 vp9_avg_bilin_16v_8bpp_c: 82.0 vp9_avg_bilin_16v_8bpp_rvv_i64: 9.0 vp9_avg_bilin_32h_8bpp_c: 325.5 vp9_avg_bilin_32h_8bpp_rvv_i64: 26.2 vp9_avg_bilin_32v_8bpp_c: 326.2 vp9_avg_bilin_32v_8bpp_rvv_i64: 26.2 vp9_avg_bilin_64h_8bpp_c: 1265.7 vp9_avg_bilin_64h_8bpp_rvv_i64: 91.5 vp9_avg_bilin_64v_8bpp_c: 1317.0 vp9_avg_bilin_64v_8bpp_rvv_i64: 91.2 vp9_put_bilin_4h_8bpp_c: 4.5 vp9_put_bilin_4h_8bpp_rvv_i64: 1.7 vp9_put_bilin_4v_8bpp_c: 4.7 vp9_put_bilin_4v_8bpp_rvv_i64: 1.7 vp9_put_bilin_8h_8bpp_c: 17.0 vp9_put_bilin_8h_8bpp_rvv_i64: 3.5 vp9_put_bilin_8v_8bpp_c: 18.0 vp9_put_bilin_8v_8bpp_rvv_i64: 3.5 vp9_put_bilin_16h_8bpp_c: 65.2 vp9_put_bilin_16h_8bpp_rvv_i64: 7.5 vp9_put_bilin_16v_8bpp_c: 85.7 vp9_put_bilin_16v_8bpp_rvv_i64: 7.5 vp9_put_bilin_32h_8bpp_c: 257.5 vp9_put_bilin_32h_8bpp_rvv_i64: 23.5 vp9_put_bilin_32v_8bpp_c: 274.5 vp9_put_bilin_32v_8bpp_rvv_i64: 23.5 vp9_put_bilin_64h_8bpp_c: 1040.5 vp9_put_bilin_64h_8bpp_rvv_i64: 82.5 vp9_put_bilin_64v_8bpp_c: 1108.7 vp9_put_bilin_64v_8bpp_rvv_i64: 82.2 --- libavcodec/riscv/vp9_mc_rvv.S | 43 ++++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp.h | 12 +++++----- libavcodec/riscv/vp9dsp_init.c | 21 +++++++++++++++++ 3 files changed, 70 
insertions(+), 6 deletions(-) diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S index 7cb38ec94a..739380d9a9 100644 --- a/libavcodec/riscv/vp9_mc_rvv.S +++ b/libavcodec/riscv/vp9_mc_rvv.S @@ -53,6 +53,49 @@ func ff_vp9_avg\len\()_rvv, zve32x endfunc .endm +.macro bilin_load dst len op type mn +.ifc \type,v + add t5, a2, a3 +.else + addi t5, a2, 1 +.endif + vle8.v v8, (a2) + vle8.v v0, (t5) + vwmulu.vx v16, v0, \mn + vwmaccsu.vx v16, t1, v8 + vwadd.wx v16, v16, t4 + vnsra.wi v16, v16, 4 + vadd.vv \dst, v16, v8 +.ifc \op,avg + vle8.v v16, (a0) + vaaddu.vv \dst, \dst, v16 +.endif +.endm + +.macro bilin_h_v len op type mn +func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x +.ifc \op,avg + csrwi vxrm, 0 +.endif + vsetvlstatic8 \len, t0, 64 + li t4, 8 + neg t1, \mn +1: + addi a4, a4, -1 + bilin_load v0, \len, \op, \type, \mn + vse8.v v0, (a0) + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc +.endm + .irp len, 64, 32, 16, 8, 4 copy_avg \len + .irp op, put, avg + bilin_h_v \len, \op, h, a5 + bilin_h_v \len, \op, v, a6 + .endr .endr diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h index ff8431591c..8fb326dae0 100644 --- a/libavcodec/riscv/vp9dsp.h +++ b/libavcodec/riscv/vp9dsp.h @@ -113,27 +113,27 @@ void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ int h, int mx, int my); #define VP9_BILINEAR_RISCV_RVV_FUNC(SIZE) \ -void ff_put_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_put_vp9_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_put_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_put_vp9_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_put_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_put_vp9_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ const 
uint8_t *src, ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_avg_vp9_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_avg_vp9_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_avg_vp9_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int mx, int my); diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index 454dcd963f..9606d8545f 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -63,6 +63,27 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) init_fpel(3, 8); init_fpel(4, 4); + dsp->mc[0][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_64v_rvv; + dsp->mc[0][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_64h_rvv; + dsp->mc[0][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_64v_rvv; + dsp->mc[0][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_64h_rvv; + dsp->mc[1][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_32v_rvv; + dsp->mc[1][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_32h_rvv; + dsp->mc[1][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_32v_rvv; + dsp->mc[1][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_32h_rvv; + dsp->mc[2][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_16v_rvv; + dsp->mc[2][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_16h_rvv; + dsp->mc[2][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_16v_rvv; + dsp->mc[2][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_16h_rvv; + dsp->mc[3][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_8v_rvv; + dsp->mc[3][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_8h_rvv; + dsp->mc[3][FILTER_BILINEAR 
][1][0][1] = ff_avg_vp9_bilin_8v_rvv; + dsp->mc[3][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_8h_rvv; + dsp->mc[4][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_4v_rvv; + dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv; + dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv; + dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv; + #undef init_fpel } #endif -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 2/5] lavc/vp9dsp: R-V V mc bilin h v 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 2/5] lavc/vp9dsp: R-V V mc bilin h v uk7b @ 2024-05-21 18:29 ` Rémi Denis-Courmont 2024-05-21 18:36 ` flow gg 0 siblings, 1 reply; 11+ messages in thread From: Rémi Denis-Courmont @ 2024-05-21 18:29 UTC (permalink / raw) To: ffmpeg-devel Le tiistaina 21. toukokuuta 2024, 20.13.16 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S > index 7cb38ec94a..739380d9a9 100644 > --- a/libavcodec/riscv/vp9_mc_rvv.S > +++ b/libavcodec/riscv/vp9_mc_rvv.S > @@ -53,6 +53,49 @@ func ff_vp9_avg\len\()_rvv, zve32x > endfunc > .endm > > +.macro bilin_load dst len op type mn Commas, please. -- Rémi Denis-Courmont http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 2/5] lavc/vp9dsp: R-V V mc bilin h v 2024-05-21 18:29 ` Rémi Denis-Courmont @ 2024-05-21 18:36 ` flow gg 2024-05-26 13:26 ` Rémi Denis-Courmont 0 siblings, 1 reply; 11+ messages in thread From: flow gg @ 2024-05-21 18:36 UTC (permalink / raw) To: FFmpeg development discussions and patches Do macro definitions also need a comma? I noticed that much of my old code and SiFive's code doesn't have a comma Rémi Denis-Courmont <remi@remlab.net> 于2024年5月22日周三 02:29写道: > Le tiistaina 21. toukokuuta 2024, 20.13.16 EEST uk7b@foxmail.com a écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > diff --git a/libavcodec/riscv/vp9_mc_rvv.S > b/libavcodec/riscv/vp9_mc_rvv.S > > index 7cb38ec94a..739380d9a9 100644 > > --- a/libavcodec/riscv/vp9_mc_rvv.S > > +++ b/libavcodec/riscv/vp9_mc_rvv.S > > @@ -53,6 +53,49 @@ func ff_vp9_avg\len\()_rvv, zve32x > > endfunc > > .endm > > > > +.macro bilin_load dst len op type mn > > Commas, please. > > -- > Rémi Denis-Courmont > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 2/5] lavc/vp9dsp: R-V V mc bilin h v 2024-05-21 18:36 ` flow gg @ 2024-05-26 13:26 ` Rémi Denis-Courmont 0 siblings, 0 replies; 11+ messages in thread From: Rémi Denis-Courmont @ 2024-05-26 13:26 UTC (permalink / raw) To: FFmpeg development discussions and patches Le tiistaina 21. toukokuuta 2024, 21.36.33 EEST flow gg a écrit : > Do macros definition also need a comma? Indeed not. I'd rather be safe than sorry though. -- レミ・デニ-クールモン http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 11+ messages in thread
* [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v [not found] <20240521171319.2629938-1-uk7b@foxmail.com> 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 2/5] lavc/vp9dsp: R-V V mc bilin h v uk7b @ 2024-05-21 17:13 ` uk7b 2024-05-25 10:17 ` Rémi Denis-Courmont 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 4/5] lavc/vp9dsp: R-V V mc bilin hv uk7b 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 5/5] lavc/vp9dsp: R-V V mc tap hv uk7b 3 siblings, 1 reply; 11+ messages in thread From: uk7b @ 2024-05-21 17:13 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908 X60 vp9_avg_8tap_smooth_4h_8bpp_c : 13.0 11.2 vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 5.0 4.2 vp9_avg_8tap_smooth_4v_8bpp_c : 13.7 12.5 vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 5.0 4.2 vp9_avg_8tap_smooth_8h_8bpp_c : 49.5 42.2 vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.2 8.5 vp9_avg_8tap_smooth_8v_8bpp_c : 66.5 45.0 vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5 vp9_avg_8tap_smooth_16h_8bpp_c : 192.7 166.5 vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.2 18.7 vp9_avg_8tap_smooth_16v_8bpp_c : 192.2 175.7 vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.5 19.0 vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.7 vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 83.5 60.0 vp9_avg_8tap_smooth_32v_8bpp_c : 770.5 689.2 vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.2 60.0 vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2 vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 283.5 119.2 vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0 vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 305.2 119.0 vp9_put_8tap_smooth_4h_8bpp_c : 11.2 9.7 vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 4.0 vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.7 vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.2 4.0 vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5 vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7 vp9_put_8tap_smooth_8v_8bpp_c : 44.2 38.7 vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.5 7.7 vp9_put_8tap_smooth_16h_8bpp_c : 165.7 147.2 vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 19.5 17.5 
vp9_put_8tap_smooth_16v_8bpp_c : 169.0 149.7 vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5 vp9_put_8tap_smooth_32h_8bpp_c : 659.7 586.7 vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 64.2 57.2 vp9_put_8tap_smooth_32v_8bpp_c : 680.5 591.2 vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.2 57.2 vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0 vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 255.5 114.2 vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7 vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0 --- libavcodec/riscv/vp9_mc_rvv.S | 243 +++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp.h | 72 ++++++---- libavcodec/riscv/vp9dsp_init.c | 38 +++++- 3 files changed, 328 insertions(+), 25 deletions(-) diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S index 739380d9a9..adba4afb90 100644 --- a/libavcodec/riscv/vp9_mc_rvv.S +++ b/libavcodec/riscv/vp9_mc_rvv.S @@ -36,6 +36,18 @@ .endif .endm +.macro vsetvlstatic16 len +.ifc \len,4 + vsetvli zero, zero, e16, mf2, ta, ma +.elseif \len == 8 + vsetvli zero, zero, e16, m1, ta, ma +.elseif \len == 16 + vsetvli zero, zero, e16, m2, ta, ma +.else + vsetvli zero, zero, e16, m4, ta, ma +.endif +.endm + .macro copy_avg len func ff_vp9_avg\len\()_rvv, zve32x csrwi vxrm, 0 @@ -92,10 +104,241 @@ func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x endfunc .endm +const subpel_filters_regular + .byte 0, 0, 0, 128, 0, 0, 0, 0 + .byte 0, 1, -5, 126, 8, -3, 1, 0 + .byte -1, 3, -10, 122, 18, -6, 2, 0 + .byte -1, 4, -13, 118, 27, -9, 3, -1 + .byte -1, 4, -16, 112, 37, -11, 4, -1 + .byte -1, 5, -18, 105, 48, -14, 4, -1 + .byte -1, 5, -19, 97, 58, -16, 5, -1 + .byte -1, 6, -19, 88, 68, -18, 5, -1 + .byte -1, 6, -19, 78, 78, -19, 6, -1 + .byte -1, 5, -18, 68, 88, -19, 6, -1 + .byte -1, 5, -16, 58, 97, -19, 5, -1 + .byte -1, 4, -14, 48, 105, -18, 5, -1 + .byte -1, 4, -11, 37, 112, -16, 4, -1 + .byte -1, 3, -9, 27, 118, -13, 4, -1 + .byte 0, 2, -6, 18, 122, -10, 3, -1 + .byte 0, 1, -3, 8, 126, -5, 1, 0 +subpel_filters_sharp: + .byte 0, 0, 0, 
128, 0, 0, 0, 0 + .byte -1, 3, -7, 127, 8, -3, 1, 0 + .byte -2, 5, -13, 125, 17, -6, 3, -1 + .byte -3, 7, -17, 121, 27, -10, 5, -2 + .byte -4, 9, -20, 115, 37, -13, 6, -2 + .byte -4, 10, -23, 108, 48, -16, 8, -3 + .byte -4, 10, -24, 100, 59, -19, 9, -3 + .byte -4, 11, -24, 90, 70, -21, 10, -4 + .byte -4, 11, -23, 80, 80, -23, 11, -4 + .byte -4, 10, -21, 70, 90, -24, 11, -4 + .byte -3, 9, -19, 59, 100, -24, 10, -4 + .byte -3, 8, -16, 48, 108, -23, 10, -4 + .byte -2, 6, -13, 37, 115, -20, 9, -4 + .byte -2, 5, -10, 27, 121, -17, 7, -3 + .byte -1, 3, -6, 17, 125, -13, 5, -2 + .byte 0, 1, -3, 8, 127, -7, 3, -1 +subpel_filters_smooth: + .byte 0, 0, 0, 128, 0, 0, 0, 0 + .byte -3, -1, 32, 64, 38, 1, -3, 0 + .byte -2, -2, 29, 63, 41, 2, -3, 0 + .byte -2, -2, 26, 63, 43, 4, -4, 0 + .byte -2, -3, 24, 62, 46, 5, -4, 0 + .byte -2, -3, 21, 60, 49, 7, -4, 0 + .byte -1, -4, 18, 59, 51, 9, -4, 0 + .byte -1, -4, 16, 57, 53, 12, -4, -1 + .byte -1, -4, 14, 55, 55, 14, -4, -1 + .byte -1, -4, 12, 53, 57, 16, -4, -1 + .byte 0, -4, 9, 51, 59, 18, -4, -1 + .byte 0, -4, 7, 49, 60, 21, -3, -2 + .byte 0, -4, 5, 46, 62, 24, -3, -2 + .byte 0, -4, 4, 43, 63, 26, -2, -2 + .byte 0, -3, 2, 41, 63, 29, -2, -2 + .byte 0, -3, 1, 38, 64, 32, -1, -3 +endconst + +.macro epel_filter name type regtype + lla \regtype\()2, subpel_filters_\name + li \regtype\()1, 8 +.ifc \type,v + mul \regtype\()0, a6, \regtype\()1 +.else + mul \regtype\()0, a5, \regtype\()1 +.endif + add \regtype\()0, \regtype\()0, \regtype\()2 + .irp n,1,2,3,4,5,6 + lb \regtype\n, \n(\regtype\()0) + .endr +.ifc \regtype,t + lb a7, 7(\regtype\()0) +.else + lb s7, 7(\regtype\()0) +.endif + lb \regtype\()0, 0(\regtype\()0) +.endm + +.macro epel_load dst len op name type from_mem regtype + li a5, 64 +.ifc \from_mem, 1 + vle8.v v22, (a2) +.ifc \type,v + sub a2, a2, a3 + vle8.v v20, (a2) + sh1add a2, a3, a2 + vle8.v v24, (a2) + add a2, a2, a3 + vle8.v v26, (a2) + add a2, a2, a3 + vle8.v v28, (a2) + add a2, a2, a3 + vle8.v v30, (a2) +.else + addi 
a2, a2, -1 + vle8.v v20, (a2) + addi a2, a2, 2 + vle8.v v24, (a2) + addi a2, a2, 1 + vle8.v v26, (a2) + addi a2, a2, 1 + vle8.v v28, (a2) + addi a2, a2, 1 + vle8.v v30, (a2) +.endif + +.ifc \name,smooth + vwmulu.vx v16, v24, \regtype\()4 + vwmaccu.vx v16, \regtype\()2, v20 + vwmaccu.vx v16, \regtype\()5, v26 + vwmaccsu.vx v16, \regtype\()6, v28 +.else + vwmulu.vx v16, v28, \regtype\()6 + vwmaccsu.vx v16, \regtype\()2, v20 + vwmaccsu.vx v16, \regtype\()5, v26 +.endif + +.ifc \regtype,t + vwmaccsu.vx v16, a7, v30 +.else + vwmaccsu.vx v16, s7, v30 +.endif + +.ifc \type,v + .rept 6 + sub a2, a2, a3 + .endr + vle8.v v28, (a2) + sub a2, a2, a3 + vle8.v v26, (a2) + sh1add a2, a3, a2 + add a2, a2, a3 +.else + addi a2, a2, -6 + vle8.v v28, (a2) + addi a2, a2, -1 + vle8.v v26, (a2) + addi a2, a2, 3 +.endif + +.ifc \name,smooth + vwmaccsu.vx v16, \regtype\()1, v28 +.else + vwmaccu.vx v16, \regtype\()1, v28 + vwmulu.vx v28, v24, \regtype\()4 +.endif + vwmaccsu.vx v16, \regtype\()0, v26 + vwmulu.vx v20, v22, \regtype\()3 +.else +.ifc \name,smooth + vwmulu.vx v16, v8, \regtype\()4 + vwmaccu.vx v16, \regtype\()2, v4 + vwmaccu.vx v16, \regtype\()5, v10 + vwmaccsu.vx v16, \regtype\()6, v12 + vwmaccsu.vx v16, \regtype\()1, v2 +.else + vwmulu.vx v16, v2, \regtype\()1 + vwmaccu.vx v16, \regtype\()6, v12 + vwmaccsu.vx v16, \regtype\()5, v10 + vwmaccsu.vx v16, \regtype\()2, v4 + vwmulu.vx v28, v8, \regtype\()4 +.endif + vwmaccsu.vx v16, \regtype\()0, v0 + vwmulu.vx v20, v6, \regtype\()3 + +.ifc \regtype,t + vwmaccsu.vx v16, a7, v14 +.else + vwmaccsu.vx v16, s7, v14 +.endif + +.endif + vwadd.wx v16, v16, a5 + vsetvlstatic16 \len + +.ifc \name,smooth + vwadd.vv v24, v16, v20 +.else + vwadd.vv v24, v16, v28 + vwadd.wv v24, v24, v20 +.endif + vnsra.wi v24, v24, 7 + vmax.vx v24, v24, zero + vsetvlstatic8 \len, zero, 32, m2 + + vnclipu.wi \dst, v24, 0 +.ifc \op,avg + vle8.v v24, (a0) + vaaddu.vv \dst, \dst, v24 +.endif + +.endm + +.macro epel_load_inc dst len op name type from_mem regtype + 
epel_load \dst, \len, \op, \name, \type, \from_mem, \regtype + add a2, a2, a3 +.endm + +.macro epel len op name type vlen +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x + epel_filter \name, \type, t +.if \vlen < 256 + vsetvlstatic8 \len, a5, 32, m2 +.else + vsetvlstatic8 \len, a5, 64, m2 +.endif +.ifc \op,avg + csrwi vxrm, 0 +.endif + +1: + addi a4, a4, -1 + epel_load v30, \len, \op, \name, \type, 1, t + vse8.v v30, (a0) +.if \len == 64 && \vlen < 256 + addi a0, a0, 32 + addi a2, a2, 32 + epel_load v30, \len, \op, \name, \type, 1, t + vse8.v v30, (a0) + addi a0, a0, -32 + addi a2, a2, -32 +.endif + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc +.endm + .irp len, 64, 32, 16, 8, 4 copy_avg \len .irp op, put, avg bilin_h_v \len, \op, h, a5 bilin_h_v \len, \op, v, a6 + .irp name, regular, sharp, smooth + .irp type, h, v + epel \len, \op, \name, \type, 128 + epel \len, \op, \name, \type, 256 + .endr + .endr .endr .endr diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h index 8fb326dae0..5fd64a1b8c 100644 --- a/libavcodec/riscv/vp9dsp.h +++ b/libavcodec/riscv/vp9dsp.h @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen) \ +void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t 
dststride, \ +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int mx, int my); -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR); - -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP); - -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH); +VP9_8TAP_RISCV_RVV_FUNC(64, regular, 
FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128); + +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128); + +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128); + +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256); + +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256); + +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256); VP9_BILINEAR_RISCV_RVV_FUNC(64); VP9_BILINEAR_RISCV_RVV_FUNC(32); diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index 
9606d8545f..314a1e5808 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) # endif #if HAVE_RVV - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) { + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) { + if (ff_rv_vlen_least(128)) { #define init_fpel(idx1, sz) \ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv; \ @@ -85,7 +86,42 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv; #undef init_fpel + +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \ + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \ + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \ + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen; + +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \ + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \ + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \ + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \ + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \ + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen) + + init_subpel2(0, 1, 0, h, put, 128); + init_subpel2(1, 1, 0, h, avg, 128); + + if (flags & AV_CPU_FLAG_RVB_ADDR) { + init_subpel2(0, 0, 1, v, put, 128); + init_subpel2(1, 0, 1, v, avg, 128); + } + + } + if (ff_rv_vlen_least(256)) { + init_subpel2(0, 1, 0, h, put, 256); + init_subpel2(1, 1, 0, h, avg, 256); + + if (flags & AV_CPU_FLAG_RVB_ADDR) { + init_subpel2(0, 0, 1, v, put, 256); + init_subpel2(1, 0, 1, v, avg, 256); + } } + } + #endif #endif } -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org 
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap " uk7b @ 2024-05-25 10:17 ` Rémi Denis-Courmont 2024-05-25 10:38 ` flow gg 0 siblings, 1 reply; 11+ messages in thread From: Rémi Denis-Courmont @ 2024-05-25 10:17 UTC (permalink / raw) To: ffmpeg-devel Le tiistaina 21. toukokuuta 2024, 20.13.17 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > > C908 X60 > vp9_avg_8tap_smooth_4h_8bpp_c : 13.0 11.2 > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 5.0 4.2 > vp9_avg_8tap_smooth_4v_8bpp_c : 13.7 12.5 > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 5.0 4.2 > vp9_avg_8tap_smooth_8h_8bpp_c : 49.5 42.2 > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.2 8.5 > vp9_avg_8tap_smooth_8v_8bpp_c : 66.5 45.0 > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5 > vp9_avg_8tap_smooth_16h_8bpp_c : 192.7 166.5 > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.2 18.7 > vp9_avg_8tap_smooth_16v_8bpp_c : 192.2 175.7 > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.5 19.0 > vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.7 > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 83.5 60.0 > vp9_avg_8tap_smooth_32v_8bpp_c : 770.5 689.2 > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.2 60.0 > vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2 > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 283.5 119.2 > vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0 > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 305.2 119.0 > vp9_put_8tap_smooth_4h_8bpp_c : 11.2 9.7 > vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 4.0 > vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.7 > vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.2 4.0 > vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5 > vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7 > vp9_put_8tap_smooth_8v_8bpp_c : 44.2 38.7 > vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.5 7.7 > vp9_put_8tap_smooth_16h_8bpp_c : 165.7 147.2 > vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 19.5 17.5 > vp9_put_8tap_smooth_16v_8bpp_c : 169.0 149.7 > vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 
17.5 > vp9_put_8tap_smooth_32h_8bpp_c : 659.7 586.7 > vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 64.2 57.2 > vp9_put_8tap_smooth_32v_8bpp_c : 680.5 591.2 > vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.2 57.2 > vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0 > vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 255.5 114.2 > vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7 > vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0 > --- > libavcodec/riscv/vp9_mc_rvv.S | 243 +++++++++++++++++++++++++++++++++ > libavcodec/riscv/vp9dsp.h | 72 ++++++---- > libavcodec/riscv/vp9dsp_init.c | 38 +++++- > 3 files changed, 328 insertions(+), 25 deletions(-) > > diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S > index 739380d9a9..adba4afb90 100644 > --- a/libavcodec/riscv/vp9_mc_rvv.S > +++ b/libavcodec/riscv/vp9_mc_rvv.S > @@ -36,6 +36,18 @@ > .endif > .endm > > +.macro vsetvlstatic16 len > +.ifc \len,4 > + vsetvli zero, zero, e16, mf2, ta, ma > +.elseif \len == 8 > + vsetvli zero, zero, e16, m1, ta, ma > +.elseif \len == 16 > + vsetvli zero, zero, e16, m2, ta, ma > +.else > + vsetvli zero, zero, e16, m4, ta, ma > +.endif > +.endm > + > .macro copy_avg len > func ff_vp9_avg\len\()_rvv, zve32x > csrwi vxrm, 0 > @@ -92,10 +104,241 @@ func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x > endfunc > .endm > > +const subpel_filters_regular > + .byte 0, 0, 0, 128, 0, 0, 0, 0 > + .byte 0, 1, -5, 126, 8, -3, 1, 0 > + .byte -1, 3, -10, 122, 18, -6, 2, 0 > + .byte -1, 4, -13, 118, 27, -9, 3, -1 > + .byte -1, 4, -16, 112, 37, -11, 4, -1 > + .byte -1, 5, -18, 105, 48, -14, 4, -1 > + .byte -1, 5, -19, 97, 58, -16, 5, -1 > + .byte -1, 6, -19, 88, 68, -18, 5, -1 > + .byte -1, 6, -19, 78, 78, -19, 6, -1 > + .byte -1, 5, -18, 68, 88, -19, 6, -1 > + .byte -1, 5, -16, 58, 97, -19, 5, -1 > + .byte -1, 4, -14, 48, 105, -18, 5, -1 > + .byte -1, 4, -11, 37, 112, -16, 4, -1 > + .byte -1, 3, -9, 27, 118, -13, 4, -1 > + .byte 0, 2, -6, 18, 122, -10, 3, -1 > + .byte 0, 1, -3, 8, 126, -5, 1, 0 > 
+subpel_filters_sharp: > + .byte 0, 0, 0, 128, 0, 0, 0, 0 > + .byte -1, 3, -7, 127, 8, -3, 1, 0 > + .byte -2, 5, -13, 125, 17, -6, 3, -1 > + .byte -3, 7, -17, 121, 27, -10, 5, -2 > + .byte -4, 9, -20, 115, 37, -13, 6, -2 > + .byte -4, 10, -23, 108, 48, -16, 8, -3 > + .byte -4, 10, -24, 100, 59, -19, 9, -3 > + .byte -4, 11, -24, 90, 70, -21, 10, -4 > + .byte -4, 11, -23, 80, 80, -23, 11, -4 > + .byte -4, 10, -21, 70, 90, -24, 11, -4 > + .byte -3, 9, -19, 59, 100, -24, 10, -4 > + .byte -3, 8, -16, 48, 108, -23, 10, -4 > + .byte -2, 6, -13, 37, 115, -20, 9, -4 > + .byte -2, 5, -10, 27, 121, -17, 7, -3 > + .byte -1, 3, -6, 17, 125, -13, 5, -2 > + .byte 0, 1, -3, 8, 127, -7, 3, -1 > +subpel_filters_smooth: > + .byte 0, 0, 0, 128, 0, 0, 0, 0 > + .byte -3, -1, 32, 64, 38, 1, -3, 0 > + .byte -2, -2, 29, 63, 41, 2, -3, 0 > + .byte -2, -2, 26, 63, 43, 4, -4, 0 > + .byte -2, -3, 24, 62, 46, 5, -4, 0 > + .byte -2, -3, 21, 60, 49, 7, -4, 0 > + .byte -1, -4, 18, 59, 51, 9, -4, 0 > + .byte -1, -4, 16, 57, 53, 12, -4, -1 > + .byte -1, -4, 14, 55, 55, 14, -4, -1 > + .byte -1, -4, 12, 53, 57, 16, -4, -1 > + .byte 0, -4, 9, 51, 59, 18, -4, -1 > + .byte 0, -4, 7, 49, 60, 21, -3, -2 > + .byte 0, -4, 5, 46, 62, 24, -3, -2 > + .byte 0, -4, 4, 43, 63, 26, -2, -2 > + .byte 0, -3, 2, 41, 63, 29, -2, -2 > + .byte 0, -3, 1, 38, 64, 32, -1, -3 > +endconst Is there a reason that you cannot use the tables from C code? > + > +.macro epel_filter name type regtype > + lla \regtype\()2, subpel_filters_\name It should be possible to spare one ADDI by using just AUIPC here, and folding the immediate offset into the LB's below (see also H.263 loop filter). > + li \regtype\()1, 8 > +.ifc \type,v > + mul \regtype\()0, a6, \regtype\()1 > +.else > + mul \regtype\()0, a5, \regtype\()1 slli 3 ? 
> +.endif > + add \regtype\()0, \regtype\()0, \regtype\()2 > + .irp n,1,2,3,4,5,6 > + lb \regtype\n, \n(\regtype\()0) > + .endr > +.ifc \regtype,t > + lb a7, 7(\regtype\()0) > +.else > + lb s7, 7(\regtype\()0) > +.endif > + lb \regtype\()0, 0(\regtype\()0) > +.endm > + > +.macro epel_load dst len op name type from_mem regtype > + li a5, 64 > +.ifc \from_mem, 1 > + vle8.v v22, (a2) > +.ifc \type,v > + sub a2, a2, a3 > + vle8.v v20, (a2) > + sh1add a2, a3, a2 > + vle8.v v24, (a2) > + add a2, a2, a3 > + vle8.v v26, (a2) > + add a2, a2, a3 > + vle8.v v28, (a2) > + add a2, a2, a3 > + vle8.v v30, (a2) > +.else > + addi a2, a2, -1 > + vle8.v v20, (a2) > + addi a2, a2, 2 > + vle8.v v24, (a2) > + addi a2, a2, 1 > + vle8.v v26, (a2) > + addi a2, a2, 1 > + vle8.v v28, (a2) > + addi a2, a2, 1 > + vle8.v v30, (a2) > +.endif > + > +.ifc \name,smooth > + vwmulu.vx v16, v24, \regtype\()4 > + vwmaccu.vx v16, \regtype\()2, v20 > + vwmaccu.vx v16, \regtype\()5, v26 > + vwmaccsu.vx v16, \regtype\()6, v28 > +.else > + vwmulu.vx v16, v28, \regtype\()6 > + vwmaccsu.vx v16, \regtype\()2, v20 > + vwmaccsu.vx v16, \regtype\()5, v26 > +.endif > + > +.ifc \regtype,t > + vwmaccsu.vx v16, a7, v30 > +.else > + vwmaccsu.vx v16, s7, v30 > +.endif > + > +.ifc \type,v > + .rept 6 > + sub a2, a2, a3 > + .endr > + vle8.v v28, (a2) > + sub a2, a2, a3 > + vle8.v v26, (a2) > + sh1add a2, a3, a2 > + add a2, a2, a3 > +.else > + addi a2, a2, -6 > + vle8.v v28, (a2) > + addi a2, a2, -1 > + vle8.v v26, (a2) > + addi a2, a2, 3 > +.endif > + > +.ifc \name,smooth > + vwmaccsu.vx v16, \regtype\()1, v28 > +.else > + vwmaccu.vx v16, \regtype\()1, v28 > + vwmulu.vx v28, v24, \regtype\()4 > +.endif > + vwmaccsu.vx v16, \regtype\()0, v26 > + vwmulu.vx v20, v22, \regtype\()3 > +.else > +.ifc \name,smooth > + vwmulu.vx v16, v8, \regtype\()4 > + vwmaccu.vx v16, \regtype\()2, v4 > + vwmaccu.vx v16, \regtype\()5, v10 > + vwmaccsu.vx v16, \regtype\()6, v12 > + vwmaccsu.vx v16, \regtype\()1, v2 > +.else > + vwmulu.vx v16, 
v2, \regtype\()1 > + vwmaccu.vx v16, \regtype\()6, v12 > + vwmaccsu.vx v16, \regtype\()5, v10 > + vwmaccsu.vx v16, \regtype\()2, v4 > + vwmulu.vx v28, v8, \regtype\()4 > +.endif > + vwmaccsu.vx v16, \regtype\()0, v0 > + vwmulu.vx v20, v6, \regtype\()3 > + > +.ifc \regtype,t > + vwmaccsu.vx v16, a7, v14 > +.else > + vwmaccsu.vx v16, s7, v14 > +.endif > + > +.endif > + vwadd.wx v16, v16, a5 > + vsetvlstatic16 \len > + > +.ifc \name,smooth > + vwadd.vv v24, v16, v20 > +.else > + vwadd.vv v24, v16, v28 > + vwadd.wv v24, v24, v20 > +.endif > + vnsra.wi v24, v24, 7 > + vmax.vx v24, v24, zero > + vsetvlstatic8 \len, zero, 32, m2 > + > + vnclipu.wi \dst, v24, 0 > +.ifc \op,avg > + vle8.v v24, (a0) > + vaaddu.vv \dst, \dst, v24 > +.endif > + > +.endm > + > +.macro epel_load_inc dst len op name type from_mem regtype > + epel_load \dst, \len, \op, \name, \type, \from_mem, \regtype > + add a2, a2, a3 > +.endm > + > +.macro epel len op name type vlen > +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x > + epel_filter \name, \type, t > +.if \vlen < 256 > + vsetvlstatic8 \len, a5, 32, m2 > +.else > + vsetvlstatic8 \len, a5, 64, m2 > +.endif > +.ifc \op,avg > + csrwi vxrm, 0 > +.endif > + > +1: > + addi a4, a4, -1 > + epel_load v30, \len, \op, \name, \type, 1, t > + vse8.v v30, (a0) > +.if \len == 64 && \vlen < 256 > + addi a0, a0, 32 > + addi a2, a2, 32 > + epel_load v30, \len, \op, \name, \type, 1, t > + vse8.v v30, (a0) > + addi a0, a0, -32 > + addi a2, a2, -32 > +.endif > + add a2, a2, a3 > + add a0, a0, a1 > + bnez a4, 1b > + > + ret > +endfunc > +.endm > + > .irp len, 64, 32, 16, 8, 4 > copy_avg \len > .irp op, put, avg > bilin_h_v \len, \op, h, a5 > bilin_h_v \len, \op, v, a6 > + .irp name, regular, sharp, smooth AFAICT, regular and sharp are identical, except for the base address of the filter table, so it should be possible to share the byte code. 
Similarly, it should be possible to share most of the horizontal and vertical code (maybe also for bilinear, not just EPel) with separate load/store then inner procedures. The H.263 loop filter already does that with almost no overhead, though H.263 is obviously simpler than VP9. A French philosopher famously said that Perfect is the enemy of Good. Generally, as with VVC, nested repetition macros for finely specialised functions tend to generate way too much byte code, and this ends up being worse rather than better in the big picture.
*dst, ptrdiff_t dststride, \ > +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, > \ + ptrdiff_t dststride, > \ const uint8_t *src, \ ptrdiff_t srcstride, > \ int h, int mx, int my); \ \ -void > ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ > +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, > \ + ptrdiff_t dststride, > \ const uint8_t *src, \ ptrdiff_t srcstride, > \ int h, int mx, int my); \ \ -void > ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ > +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, > \ + ptrdiff_t dststride, > \ const uint8_t *src, \ ptrdiff_t srcstride, > \ int h, int mx, int my); > @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t > dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int > mx, int my); > > -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR); > -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR); > -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR); > -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR); > -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR); > - > -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP); > -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP); > -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP); > -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP); > -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP); > - > -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH); > -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH); > -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH); > -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH); > -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH); > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128); > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128); > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128); > 
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128); > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128); > + > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128); > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128); > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128); > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128); > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128); > + > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128); > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128); > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128); > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128); > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128); > + > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256); > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256); > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256); > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256); > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256); > + > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256); > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256); > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256); > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256); > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256); > + > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256); > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256); > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256); > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256); > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256); > > VP9_BILINEAR_RISCV_RVV_FUNC(64); > VP9_BILINEAR_RISCV_RVV_FUNC(32); > diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c > index 9606d8545f..314a1e5808 100644 > --- a/libavcodec/riscv/vp9dsp_init.c > +++ 
b/libavcodec/riscv/vp9dsp_init.c > @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext > *dsp, int bpp) # endif > > #if HAVE_RVV > - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) > { + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) { > + if (ff_rv_vlen_least(128)) { > > #define init_fpel(idx1, sz) \ > dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv; \ > @@ -85,7 +86,42 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext > *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][0] = > ff_avg_vp9_bilin_4h_rvv; > > #undef init_fpel > + > +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \ > + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \ > + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \ > + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \ > + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \ > + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \ > + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen; > + > +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \ > + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \ > + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \ > + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \ > + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \ > + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen) > + > + init_subpel2(0, 1, 0, h, put, 128); > + init_subpel2(1, 1, 0, h, avg, 128); > + > + if (flags & AV_CPU_FLAG_RVB_ADDR) { > + init_subpel2(0, 0, 1, v, put, 128); > + init_subpel2(1, 0, 1, v, avg, 128); > + } > + > + } > + if (ff_rv_vlen_least(256)) { > + init_subpel2(0, 1, 0, h, put, 256); > + init_subpel2(1, 1, 0, h, avg, 256); > + > + if (flags & AV_CPU_FLAG_RVB_ADDR) { > + init_subpel2(0, 0, 1, v, put, 256); > + init_subpel2(1, 0, 1, v, avg, 256); > + } > } > + } > + > #endif > #endif > } -- Rémi Denis-Courmont http://www.remlab.net/ _______________________________________________ 
ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v 2024-05-25 10:17 ` Rémi Denis-Courmont @ 2024-05-25 10:38 ` flow gg 2024-05-25 10:47 ` flow gg 2024-05-25 17:36 ` Rémi Denis-Courmont 0 siblings, 2 replies; 11+ messages in thread From: flow gg @ 2024-05-25 10:38 UTC (permalink / raw) To: FFmpeg development discussions and patches > Is there a reason that you cannot use the tables from C code? Similar to VP8, to adjust the positive and negative data and prevent small probability overflow during calculations. > AFAICT, regular and sharp are identical, except for the base address of the > filter table, so it should be possible to share the byte code Initially, they used the same code, but after testing hundreds of times, there were always a few failures... Because the data in the table is different, when regular, sharp, and smooth use the same code, there will always be a small amount of overflow. Different signed and unsigned calculations are needed. > A French philosopher famously said that Perfect is the ennemy of Good. > Generally, as with VVC, nested repetition macros for finely specialised > functions tend to generate way too much byte code, and this ends up being > worse rather than better in the big picture. Got it, I will try to update. Rémi Denis-Courmont <remi@remlab.net> 于2024年5月25日周六 18:17写道: > Le tiistaina 21. 
toukokuuta 2024, 20.13.17 EEST uk7b@foxmail.com a écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > > C908 X60 > > vp9_avg_8tap_smooth_4h_8bpp_c : 13.0 11.2 > > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 5.0 4.2 > > vp9_avg_8tap_smooth_4v_8bpp_c : 13.7 12.5 > > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 5.0 4.2 > > vp9_avg_8tap_smooth_8h_8bpp_c : 49.5 42.2 > > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.2 8.5 > > vp9_avg_8tap_smooth_8v_8bpp_c : 66.5 45.0 > > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5 > > vp9_avg_8tap_smooth_16h_8bpp_c : 192.7 166.5 > > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.2 18.7 > > vp9_avg_8tap_smooth_16v_8bpp_c : 192.2 175.7 > > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.5 19.0 > > vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.7 > > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 83.5 60.0 > > vp9_avg_8tap_smooth_32v_8bpp_c : 770.5 689.2 > > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.2 60.0 > > vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2 > > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 283.5 119.2 > > vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0 > > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 305.2 119.0 > > vp9_put_8tap_smooth_4h_8bpp_c : 11.2 9.7 > > vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 4.0 > > vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.7 > > vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.2 4.0 > > vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5 > > vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7 > > vp9_put_8tap_smooth_8v_8bpp_c : 44.2 38.7 > > vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.5 7.7 > > vp9_put_8tap_smooth_16h_8bpp_c : 165.7 147.2 > > vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 19.5 17.5 > > vp9_put_8tap_smooth_16v_8bpp_c : 169.0 149.7 > > vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5 > > vp9_put_8tap_smooth_32h_8bpp_c : 659.7 586.7 > > vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 64.2 57.2 > > vp9_put_8tap_smooth_32v_8bpp_c : 680.5 591.2 > > vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.2 57.2 > > vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0 > > 
vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 255.5 114.2 > > vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7 > > vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0 > > --- > > libavcodec/riscv/vp9_mc_rvv.S | 243 +++++++++++++++++++++++++++++++++ > > libavcodec/riscv/vp9dsp.h | 72 ++++++---- > > libavcodec/riscv/vp9dsp_init.c | 38 +++++- > > 3 files changed, 328 insertions(+), 25 deletions(-) > > > > diff --git a/libavcodec/riscv/vp9_mc_rvv.S > b/libavcodec/riscv/vp9_mc_rvv.S > > index 739380d9a9..adba4afb90 100644 > > --- a/libavcodec/riscv/vp9_mc_rvv.S > > +++ b/libavcodec/riscv/vp9_mc_rvv.S > > @@ -36,6 +36,18 @@ > > .endif > > .endm > > > > +.macro vsetvlstatic16 len > > +.ifc \len,4 > > + vsetvli zero, zero, e16, mf2, ta, ma > > +.elseif \len == 8 > > + vsetvli zero, zero, e16, m1, ta, ma > > +.elseif \len == 16 > > + vsetvli zero, zero, e16, m2, ta, ma > > +.else > > + vsetvli zero, zero, e16, m4, ta, ma > > +.endif > > +.endm > > + > > .macro copy_avg len > > func ff_vp9_avg\len\()_rvv, zve32x > > csrwi vxrm, 0 > > @@ -92,10 +104,241 @@ func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, > zve32x > > endfunc > > .endm > > > > +const subpel_filters_regular > > + .byte 0, 0, 0, 128, 0, 0, 0, 0 > > + .byte 0, 1, -5, 126, 8, -3, 1, 0 > > + .byte -1, 3, -10, 122, 18, -6, 2, 0 > > + .byte -1, 4, -13, 118, 27, -9, 3, -1 > > + .byte -1, 4, -16, 112, 37, -11, 4, -1 > > + .byte -1, 5, -18, 105, 48, -14, 4, -1 > > + .byte -1, 5, -19, 97, 58, -16, 5, -1 > > + .byte -1, 6, -19, 88, 68, -18, 5, -1 > > + .byte -1, 6, -19, 78, 78, -19, 6, -1 > > + .byte -1, 5, -18, 68, 88, -19, 6, -1 > > + .byte -1, 5, -16, 58, 97, -19, 5, -1 > > + .byte -1, 4, -14, 48, 105, -18, 5, -1 > > + .byte -1, 4, -11, 37, 112, -16, 4, -1 > > + .byte -1, 3, -9, 27, 118, -13, 4, -1 > > + .byte 0, 2, -6, 18, 122, -10, 3, -1 > > + .byte 0, 1, -3, 8, 126, -5, 1, 0 > > +subpel_filters_sharp: > > + .byte 0, 0, 0, 128, 0, 0, 0, 0 > > + .byte -1, 3, -7, 127, 8, -3, 1, 0 > > + .byte -2, 5, -13, 125, 17, -6, 3, -1 > > + 
.byte -3, 7, -17, 121, 27, -10, 5, -2 > > + .byte -4, 9, -20, 115, 37, -13, 6, -2 > > + .byte -4, 10, -23, 108, 48, -16, 8, -3 > > + .byte -4, 10, -24, 100, 59, -19, 9, -3 > > + .byte -4, 11, -24, 90, 70, -21, 10, -4 > > + .byte -4, 11, -23, 80, 80, -23, 11, -4 > > + .byte -4, 10, -21, 70, 90, -24, 11, -4 > > + .byte -3, 9, -19, 59, 100, -24, 10, -4 > > + .byte -3, 8, -16, 48, 108, -23, 10, -4 > > + .byte -2, 6, -13, 37, 115, -20, 9, -4 > > + .byte -2, 5, -10, 27, 121, -17, 7, -3 > > + .byte -1, 3, -6, 17, 125, -13, 5, -2 > > + .byte 0, 1, -3, 8, 127, -7, 3, -1 > > +subpel_filters_smooth: > > + .byte 0, 0, 0, 128, 0, 0, 0, 0 > > + .byte -3, -1, 32, 64, 38, 1, -3, 0 > > + .byte -2, -2, 29, 63, 41, 2, -3, 0 > > + .byte -2, -2, 26, 63, 43, 4, -4, 0 > > + .byte -2, -3, 24, 62, 46, 5, -4, 0 > > + .byte -2, -3, 21, 60, 49, 7, -4, 0 > > + .byte -1, -4, 18, 59, 51, 9, -4, 0 > > + .byte -1, -4, 16, 57, 53, 12, -4, -1 > > + .byte -1, -4, 14, 55, 55, 14, -4, -1 > > + .byte -1, -4, 12, 53, 57, 16, -4, -1 > > + .byte 0, -4, 9, 51, 59, 18, -4, -1 > > + .byte 0, -4, 7, 49, 60, 21, -3, -2 > > + .byte 0, -4, 5, 46, 62, 24, -3, -2 > > + .byte 0, -4, 4, 43, 63, 26, -2, -2 > > + .byte 0, -3, 2, 41, 63, 29, -2, -2 > > + .byte 0, -3, 1, 38, 64, 32, -1, -3 > > +endconst > > Is there a reason that you cannot use the tables from C code? > > > + > > +.macro epel_filter name type regtype > > + lla \regtype\()2, subpel_filters_\name > > It should be possible to spare one ADDI by using just AUIPC here, and > folding > the immediate offset into the LB's below (see also H.263 loop filter). > > > + li \regtype\()1, 8 > > +.ifc \type,v > > + mul \regtype\()0, a6, \regtype\()1 > > +.else > > + mul \regtype\()0, a5, \regtype\()1 > > slli 3 ? 
> > > +.endif > > + add \regtype\()0, \regtype\()0, \regtype\()2 > > + .irp n,1,2,3,4,5,6 > > + lb \regtype\n, \n(\regtype\()0) > > + .endr > > +.ifc \regtype,t > > + lb a7, 7(\regtype\()0) > > +.else > > + lb s7, 7(\regtype\()0) > > +.endif > > + lb \regtype\()0, 0(\regtype\()0) > > +.endm > > + > > +.macro epel_load dst len op name type from_mem regtype > > + li a5, 64 > > +.ifc \from_mem, 1 > > + vle8.v v22, (a2) > > +.ifc \type,v > > + sub a2, a2, a3 > > + vle8.v v20, (a2) > > + sh1add a2, a3, a2 > > + vle8.v v24, (a2) > > + add a2, a2, a3 > > + vle8.v v26, (a2) > > + add a2, a2, a3 > > + vle8.v v28, (a2) > > + add a2, a2, a3 > > + vle8.v v30, (a2) > > +.else > > + addi a2, a2, -1 > > + vle8.v v20, (a2) > > + addi a2, a2, 2 > > + vle8.v v24, (a2) > > + addi a2, a2, 1 > > + vle8.v v26, (a2) > > + addi a2, a2, 1 > > + vle8.v v28, (a2) > > + addi a2, a2, 1 > > + vle8.v v30, (a2) > > +.endif > > + > > +.ifc \name,smooth > > + vwmulu.vx v16, v24, \regtype\()4 > > + vwmaccu.vx v16, \regtype\()2, v20 > > + vwmaccu.vx v16, \regtype\()5, v26 > > + vwmaccsu.vx v16, \regtype\()6, v28 > > +.else > > + vwmulu.vx v16, v28, \regtype\()6 > > + vwmaccsu.vx v16, \regtype\()2, v20 > > + vwmaccsu.vx v16, \regtype\()5, v26 > > +.endif > > + > > +.ifc \regtype,t > > + vwmaccsu.vx v16, a7, v30 > > +.else > > + vwmaccsu.vx v16, s7, v30 > > +.endif > > + > > +.ifc \type,v > > + .rept 6 > > + sub a2, a2, a3 > > + .endr > > + vle8.v v28, (a2) > > + sub a2, a2, a3 > > + vle8.v v26, (a2) > > + sh1add a2, a3, a2 > > + add a2, a2, a3 > > +.else > > + addi a2, a2, -6 > > + vle8.v v28, (a2) > > + addi a2, a2, -1 > > + vle8.v v26, (a2) > > + addi a2, a2, 3 > > +.endif > > + > > +.ifc \name,smooth > > + vwmaccsu.vx v16, \regtype\()1, v28 > > +.else > > + vwmaccu.vx v16, \regtype\()1, v28 > > + vwmulu.vx v28, v24, \regtype\()4 > > +.endif > > + vwmaccsu.vx v16, \regtype\()0, v26 > > + vwmulu.vx v20, v22, \regtype\()3 > > +.else > > +.ifc \name,smooth > > + vwmulu.vx v16, v8, \regtype\()4 > > + 
vwmaccu.vx v16, \regtype\()2, v4 > > + vwmaccu.vx v16, \regtype\()5, v10 > > + vwmaccsu.vx v16, \regtype\()6, v12 > > + vwmaccsu.vx v16, \regtype\()1, v2 > > +.else > > + vwmulu.vx v16, v2, \regtype\()1 > > + vwmaccu.vx v16, \regtype\()6, v12 > > + vwmaccsu.vx v16, \regtype\()5, v10 > > + vwmaccsu.vx v16, \regtype\()2, v4 > > + vwmulu.vx v28, v8, \regtype\()4 > > +.endif > > + vwmaccsu.vx v16, \regtype\()0, v0 > > + vwmulu.vx v20, v6, \regtype\()3 > > + > > +.ifc \regtype,t > > + vwmaccsu.vx v16, a7, v14 > > +.else > > + vwmaccsu.vx v16, s7, v14 > > +.endif > > + > > +.endif > > + vwadd.wx v16, v16, a5 > > + vsetvlstatic16 \len > > + > > +.ifc \name,smooth > > + vwadd.vv v24, v16, v20 > > +.else > > + vwadd.vv v24, v16, v28 > > + vwadd.wv v24, v24, v20 > > +.endif > > + vnsra.wi v24, v24, 7 > > + vmax.vx v24, v24, zero > > + vsetvlstatic8 \len, zero, 32, m2 > > + > > + vnclipu.wi \dst, v24, 0 > > +.ifc \op,avg > > + vle8.v v24, (a0) > > + vaaddu.vv \dst, \dst, v24 > > +.endif > > + > > +.endm > > + > > +.macro epel_load_inc dst len op name type from_mem regtype > > + epel_load \dst, \len, \op, \name, \type, \from_mem, > \regtype > > + add a2, a2, a3 > > +.endm > > + > > +.macro epel len op name type vlen > > +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x > > + epel_filter \name, \type, t > > +.if \vlen < 256 > > + vsetvlstatic8 \len, a5, 32, m2 > > +.else > > + vsetvlstatic8 \len, a5, 64, m2 > > +.endif > > +.ifc \op,avg > > + csrwi vxrm, 0 > > +.endif > > + > > +1: > > + addi a4, a4, -1 > > + epel_load v30, \len, \op, \name, \type, 1, t > > + vse8.v v30, (a0) > > +.if \len == 64 && \vlen < 256 > > + addi a0, a0, 32 > > + addi a2, a2, 32 > > + epel_load v30, \len, \op, \name, \type, 1, t > > + vse8.v v30, (a0) > > + addi a0, a0, -32 > > + addi a2, a2, -32 > > +.endif > > + add a2, a2, a3 > > + add a0, a0, a1 > > + bnez a4, 1b > > + > > + ret > > +endfunc > > +.endm > > + > > .irp len, 64, 32, 16, 8, 4 > > copy_avg \len > > .irp op, put, avg > 
> bilin_h_v \len, \op, h, a5 > > bilin_h_v \len, \op, v, a6 > > + .irp name, regular, sharp, smooth > > AFAICT, regular and sharp are identical, except for the base address of > the > filter table, so it should be possible to share the byte code. Similarly, > it > should be possible to share most of the horizontal and vertical code > (maybe > also for bilinear. not just EPel) with separate load/store then inner > procedures. The H.263 loop filter already does that though with almost no > overhead, though > H.263 is obviously simpler than VP9. > > A French philosopher famously said that Perfect is the ennemy of Good. > Generally, as with VVC, nested repetition macros for finely specialised > functions tend to generate way too much byte code, and this ends up being > worse rather than better in the big picture. > > > + .irp type, h, v > > + epel \len, \op, \name, \type, 128 > > + epel \len, \op, \name, \type, 256 > > + .endr > > + .endr > > .endr > > .endr > > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h > > index 8fb326dae0..5fd64a1b8c 100644 > > --- a/libavcodec/riscv/vp9dsp.h > > +++ b/libavcodec/riscv/vp9dsp.h > > @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, > const > > uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const > > uint8_t *l, const uint8_t *a); > > > > -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) > > > \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t > > dststride, \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, > > min_vlen) \ +void > > ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, > \ + > > ptrdiff_t dststride, > > > \ const uint8_t *src, \ ptrdiff_t srcstride, > > > \ int h, int mx, int my); \ \ -void > > ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ > > +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, > > > \ + ptrdiff_t dststride, > > > \ const uint8_t *src, \ ptrdiff_t srcstride, > > 
> \ int h, int mx, int my); \ \ -void > > ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ > > +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, > > > \ + ptrdiff_t dststride, > > > \ const uint8_t *src, \ ptrdiff_t srcstride, > > > \ int h, int mx, int my); \ \ -void > > ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ > > +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, > > > \ + ptrdiff_t dststride, > > > \ const uint8_t *src, \ ptrdiff_t srcstride, > > > \ int h, int mx, int my); \ \ -void > > ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ > > +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, > > > \ + ptrdiff_t dststride, > > > \ const uint8_t *src, \ ptrdiff_t srcstride, > > > \ int h, int mx, int my); \ \ -void > > ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ > > +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, > > > \ + ptrdiff_t dststride, > > > \ const uint8_t *src, \ ptrdiff_t srcstride, > > > \ int h, int mx, int my); > > @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t > > dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int > > mx, int my); > > > > -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR); > > -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR); > > -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR); > > -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR); > > -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR); > > - > > -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP); > > -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP); > > -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP); > > -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP); > > -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP); > > - > > -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH); > > 
-VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH); > > -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH); > > -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH); > > -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH); > > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128); > > + > > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128); > > + > > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128); > > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128); > > + > > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256); > > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256); > > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256); > > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256); > > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256); > > + > > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256); > > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256); > > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256); > > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256); > > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256); > > + > > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 
256); > > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256); > > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256); > > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256); > > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256); > > > > VP9_BILINEAR_RISCV_RVV_FUNC(64); > > VP9_BILINEAR_RISCV_RVV_FUNC(32); > > diff --git a/libavcodec/riscv/vp9dsp_init.c > b/libavcodec/riscv/vp9dsp_init.c > > index 9606d8545f..314a1e5808 100644 > > --- a/libavcodec/riscv/vp9dsp_init.c > > +++ b/libavcodec/riscv/vp9dsp_init.c > > @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext > > *dsp, int bpp) # endif > > > > #if HAVE_RVV > > - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && > ff_rv_vlen_least(128)) > > { + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) { > > + if (ff_rv_vlen_least(128)) { > > > > #define init_fpel(idx1, sz) \ > > dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = > ff_vp9_avg##sz##_rvv; \ > > @@ -85,7 +86,42 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext > > *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][0] = > > ff_avg_vp9_bilin_4h_rvv; > > > > #undef init_fpel > > + > > +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \ > > + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \ > > + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \ > > + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \ > > + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \ > > + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \ > > + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen; > > + > > +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \ > > + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \ > > + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \ > > + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \ > > + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \ > > + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen) > > + > > + 
init_subpel2(0, 1, 0, h, put, 128); > > + init_subpel2(1, 1, 0, h, avg, 128); > > + > > + if (flags & AV_CPU_FLAG_RVB_ADDR) { > > + init_subpel2(0, 0, 1, v, put, 128); > > + init_subpel2(1, 0, 1, v, avg, 128); > > + } > > + > > + } > > + if (ff_rv_vlen_least(256)) { > > + init_subpel2(0, 1, 0, h, put, 256); > > + init_subpel2(1, 1, 0, h, avg, 256); > > + > > + if (flags & AV_CPU_FLAG_RVB_ADDR) { > > + init_subpel2(0, 0, 1, v, put, 256); > > + init_subpel2(1, 0, 1, v, avg, 256); > > + } > > } > > + } > > + > > #endif > > #endif > > } > > > -- > Rémi Denis-Courmont > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v 2024-05-25 10:38 ` flow gg @ 2024-05-25 10:47 ` flow gg 2024-05-25 17:36 ` Rémi Denis-Courmont 1 sibling, 0 replies; 11+ messages in thread From: flow gg @ 2024-05-25 10:47 UTC (permalink / raw) To: FFmpeg development discussions and patches One more thing I remember is that after adjusting the sign, vmacc can be used; otherwise, due to the sign, mul + add are needed. flow gg <hlefthleft@gmail.com> 于2024年5月25日周六 18:38写道: > > Is there a reason that you cannot use the tables from C code? > > Similar to VP8, to adjust the positive and negative data and prevent small > probability overflow during calculations. > > > AFAICT, regular and sharp are identical, except for the base address of > the > > filter table, so it should be possible to share the byte code > > Initially, they used the same code, but after testing hundreds of times, > there were always a few failures... > > Because the data in the table is different, when regular, sharp, and > smooth use the same code, there will always be a small amount of overflow. > Different signed and unsigned calculations are needed. > > > A French philosopher famously said that Perfect is the ennemy of Good. > > Generally, as with VVC, nested repetition macros for finely specialised > > functions tend to generate way too much byte code, and this ends up being > > worse rather than better in the big picture. > > Got it, I will try to update. > > Rémi Denis-Courmont <remi@remlab.net> 于2024年5月25日周六 18:17写道: > >> Le tiistaina 21. 
toukokuuta 2024, 20.13.17 EEST uk7b@foxmail.com a écrit >> : >> > From: sunyuechi <sunyuechi@iscas.ac.cn> >> > >> > C908 X60 >> > vp9_avg_8tap_smooth_4h_8bpp_c : 13.0 11.2 >> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 5.0 4.2 >> > vp9_avg_8tap_smooth_4v_8bpp_c : 13.7 12.5 >> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 5.0 4.2 >> > vp9_avg_8tap_smooth_8h_8bpp_c : 49.5 42.2 >> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.2 8.5 >> > vp9_avg_8tap_smooth_8v_8bpp_c : 66.5 45.0 >> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5 >> > vp9_avg_8tap_smooth_16h_8bpp_c : 192.7 166.5 >> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.2 18.7 >> > vp9_avg_8tap_smooth_16v_8bpp_c : 192.2 175.7 >> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.5 19.0 >> > vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.7 >> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 83.5 60.0 >> > vp9_avg_8tap_smooth_32v_8bpp_c : 770.5 689.2 >> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.2 60.0 >> > vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2 >> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 283.5 119.2 >> > vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0 >> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 305.2 119.0 >> > vp9_put_8tap_smooth_4h_8bpp_c : 11.2 9.7 >> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 4.0 >> > vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.7 >> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.2 4.0 >> > vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5 >> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7 >> > vp9_put_8tap_smooth_8v_8bpp_c : 44.2 38.7 >> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.5 7.7 >> > vp9_put_8tap_smooth_16h_8bpp_c : 165.7 147.2 >> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 19.5 17.5 >> > vp9_put_8tap_smooth_16v_8bpp_c : 169.0 149.7 >> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5 >> > vp9_put_8tap_smooth_32h_8bpp_c : 659.7 586.7 >> > vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 64.2 57.2 >> > vp9_put_8tap_smooth_32v_8bpp_c : 680.5 591.2 >> > vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.2 57.2 >> > vp9_put_8tap_smooth_64h_8bpp_c : 
2681.5 2339.0 >> > vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 255.5 114.2 >> > vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7 >> > vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0 >> > --- >> > libavcodec/riscv/vp9_mc_rvv.S | 243 +++++++++++++++++++++++++++++++++ >> > libavcodec/riscv/vp9dsp.h | 72 ++++++---- >> > libavcodec/riscv/vp9dsp_init.c | 38 +++++- >> > 3 files changed, 328 insertions(+), 25 deletions(-) >> > >> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S >> b/libavcodec/riscv/vp9_mc_rvv.S >> > index 739380d9a9..adba4afb90 100644 >> > --- a/libavcodec/riscv/vp9_mc_rvv.S >> > +++ b/libavcodec/riscv/vp9_mc_rvv.S >> > @@ -36,6 +36,18 @@ >> > .endif >> > .endm >> > >> > +.macro vsetvlstatic16 len >> > +.ifc \len,4 >> > + vsetvli zero, zero, e16, mf2, ta, ma >> > +.elseif \len == 8 >> > + vsetvli zero, zero, e16, m1, ta, ma >> > +.elseif \len == 16 >> > + vsetvli zero, zero, e16, m2, ta, ma >> > +.else >> > + vsetvli zero, zero, e16, m4, ta, ma >> > +.endif >> > +.endm >> > + >> > .macro copy_avg len >> > func ff_vp9_avg\len\()_rvv, zve32x >> > csrwi vxrm, 0 >> > @@ -92,10 +104,241 @@ func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, >> zve32x >> > endfunc >> > .endm >> > >> > +const subpel_filters_regular >> > + .byte 0, 0, 0, 128, 0, 0, 0, 0 >> > + .byte 0, 1, -5, 126, 8, -3, 1, 0 >> > + .byte -1, 3, -10, 122, 18, -6, 2, 0 >> > + .byte -1, 4, -13, 118, 27, -9, 3, -1 >> > + .byte -1, 4, -16, 112, 37, -11, 4, -1 >> > + .byte -1, 5, -18, 105, 48, -14, 4, -1 >> > + .byte -1, 5, -19, 97, 58, -16, 5, -1 >> > + .byte -1, 6, -19, 88, 68, -18, 5, -1 >> > + .byte -1, 6, -19, 78, 78, -19, 6, -1 >> > + .byte -1, 5, -18, 68, 88, -19, 6, -1 >> > + .byte -1, 5, -16, 58, 97, -19, 5, -1 >> > + .byte -1, 4, -14, 48, 105, -18, 5, -1 >> > + .byte -1, 4, -11, 37, 112, -16, 4, -1 >> > + .byte -1, 3, -9, 27, 118, -13, 4, -1 >> > + .byte 0, 2, -6, 18, 122, -10, 3, -1 >> > + .byte 0, 1, -3, 8, 126, -5, 1, 0 >> > +subpel_filters_sharp: >> > + .byte 0, 0, 0, 128, 0, 0, 0, 0 >> > + .byte -1, 
3, -7, 127, 8, -3, 1, 0 >> > + .byte -2, 5, -13, 125, 17, -6, 3, -1 >> > + .byte -3, 7, -17, 121, 27, -10, 5, -2 >> > + .byte -4, 9, -20, 115, 37, -13, 6, -2 >> > + .byte -4, 10, -23, 108, 48, -16, 8, -3 >> > + .byte -4, 10, -24, 100, 59, -19, 9, -3 >> > + .byte -4, 11, -24, 90, 70, -21, 10, -4 >> > + .byte -4, 11, -23, 80, 80, -23, 11, -4 >> > + .byte -4, 10, -21, 70, 90, -24, 11, -4 >> > + .byte -3, 9, -19, 59, 100, -24, 10, -4 >> > + .byte -3, 8, -16, 48, 108, -23, 10, -4 >> > + .byte -2, 6, -13, 37, 115, -20, 9, -4 >> > + .byte -2, 5, -10, 27, 121, -17, 7, -3 >> > + .byte -1, 3, -6, 17, 125, -13, 5, -2 >> > + .byte 0, 1, -3, 8, 127, -7, 3, -1 >> > +subpel_filters_smooth: >> > + .byte 0, 0, 0, 128, 0, 0, 0, 0 >> > + .byte -3, -1, 32, 64, 38, 1, -3, 0 >> > + .byte -2, -2, 29, 63, 41, 2, -3, 0 >> > + .byte -2, -2, 26, 63, 43, 4, -4, 0 >> > + .byte -2, -3, 24, 62, 46, 5, -4, 0 >> > + .byte -2, -3, 21, 60, 49, 7, -4, 0 >> > + .byte -1, -4, 18, 59, 51, 9, -4, 0 >> > + .byte -1, -4, 16, 57, 53, 12, -4, -1 >> > + .byte -1, -4, 14, 55, 55, 14, -4, -1 >> > + .byte -1, -4, 12, 53, 57, 16, -4, -1 >> > + .byte 0, -4, 9, 51, 59, 18, -4, -1 >> > + .byte 0, -4, 7, 49, 60, 21, -3, -2 >> > + .byte 0, -4, 5, 46, 62, 24, -3, -2 >> > + .byte 0, -4, 4, 43, 63, 26, -2, -2 >> > + .byte 0, -3, 2, 41, 63, 29, -2, -2 >> > + .byte 0, -3, 1, 38, 64, 32, -1, -3 >> > +endconst >> >> Is there a reason that you cannot use the tables from C code? >> >> > + >> > +.macro epel_filter name type regtype >> > + lla \regtype\()2, subpel_filters_\name >> >> It should be possible to spare one ADDI by using just AUIPC here, and >> folding >> the immediate offset into the LB's below (see also H.263 loop filter). >> >> > + li \regtype\()1, 8 >> > +.ifc \type,v >> > + mul \regtype\()0, a6, \regtype\()1 >> > +.else >> > + mul \regtype\()0, a5, \regtype\()1 >> >> slli 3 ? 
>> >> > +.endif >> > + add \regtype\()0, \regtype\()0, \regtype\()2 >> > + .irp n,1,2,3,4,5,6 >> > + lb \regtype\n, \n(\regtype\()0) >> > + .endr >> > +.ifc \regtype,t >> > + lb a7, 7(\regtype\()0) >> > +.else >> > + lb s7, 7(\regtype\()0) >> > +.endif >> > + lb \regtype\()0, 0(\regtype\()0) >> > +.endm >> > + >> > +.macro epel_load dst len op name type from_mem regtype >> > + li a5, 64 >> > +.ifc \from_mem, 1 >> > + vle8.v v22, (a2) >> > +.ifc \type,v >> > + sub a2, a2, a3 >> > + vle8.v v20, (a2) >> > + sh1add a2, a3, a2 >> > + vle8.v v24, (a2) >> > + add a2, a2, a3 >> > + vle8.v v26, (a2) >> > + add a2, a2, a3 >> > + vle8.v v28, (a2) >> > + add a2, a2, a3 >> > + vle8.v v30, (a2) >> > +.else >> > + addi a2, a2, -1 >> > + vle8.v v20, (a2) >> > + addi a2, a2, 2 >> > + vle8.v v24, (a2) >> > + addi a2, a2, 1 >> > + vle8.v v26, (a2) >> > + addi a2, a2, 1 >> > + vle8.v v28, (a2) >> > + addi a2, a2, 1 >> > + vle8.v v30, (a2) >> > +.endif >> > + >> > +.ifc \name,smooth >> > + vwmulu.vx v16, v24, \regtype\()4 >> > + vwmaccu.vx v16, \regtype\()2, v20 >> > + vwmaccu.vx v16, \regtype\()5, v26 >> > + vwmaccsu.vx v16, \regtype\()6, v28 >> > +.else >> > + vwmulu.vx v16, v28, \regtype\()6 >> > + vwmaccsu.vx v16, \regtype\()2, v20 >> > + vwmaccsu.vx v16, \regtype\()5, v26 >> > +.endif >> > + >> > +.ifc \regtype,t >> > + vwmaccsu.vx v16, a7, v30 >> > +.else >> > + vwmaccsu.vx v16, s7, v30 >> > +.endif >> > + >> > +.ifc \type,v >> > + .rept 6 >> > + sub a2, a2, a3 >> > + .endr >> > + vle8.v v28, (a2) >> > + sub a2, a2, a3 >> > + vle8.v v26, (a2) >> > + sh1add a2, a3, a2 >> > + add a2, a2, a3 >> > +.else >> > + addi a2, a2, -6 >> > + vle8.v v28, (a2) >> > + addi a2, a2, -1 >> > + vle8.v v26, (a2) >> > + addi a2, a2, 3 >> > +.endif >> > + >> > +.ifc \name,smooth >> > + vwmaccsu.vx v16, \regtype\()1, v28 >> > +.else >> > + vwmaccu.vx v16, \regtype\()1, v28 >> > + vwmulu.vx v28, v24, \regtype\()4 >> > +.endif >> > + vwmaccsu.vx v16, \regtype\()0, v26 >> > + vwmulu.vx v20, v22, 
\regtype\()3 >> > +.else >> > +.ifc \name,smooth >> > + vwmulu.vx v16, v8, \regtype\()4 >> > + vwmaccu.vx v16, \regtype\()2, v4 >> > + vwmaccu.vx v16, \regtype\()5, v10 >> > + vwmaccsu.vx v16, \regtype\()6, v12 >> > + vwmaccsu.vx v16, \regtype\()1, v2 >> > +.else >> > + vwmulu.vx v16, v2, \regtype\()1 >> > + vwmaccu.vx v16, \regtype\()6, v12 >> > + vwmaccsu.vx v16, \regtype\()5, v10 >> > + vwmaccsu.vx v16, \regtype\()2, v4 >> > + vwmulu.vx v28, v8, \regtype\()4 >> > +.endif >> > + vwmaccsu.vx v16, \regtype\()0, v0 >> > + vwmulu.vx v20, v6, \regtype\()3 >> > + >> > +.ifc \regtype,t >> > + vwmaccsu.vx v16, a7, v14 >> > +.else >> > + vwmaccsu.vx v16, s7, v14 >> > +.endif >> > + >> > +.endif >> > + vwadd.wx v16, v16, a5 >> > + vsetvlstatic16 \len >> > + >> > +.ifc \name,smooth >> > + vwadd.vv v24, v16, v20 >> > +.else >> > + vwadd.vv v24, v16, v28 >> > + vwadd.wv v24, v24, v20 >> > +.endif >> > + vnsra.wi v24, v24, 7 >> > + vmax.vx v24, v24, zero >> > + vsetvlstatic8 \len, zero, 32, m2 >> > + >> > + vnclipu.wi \dst, v24, 0 >> > +.ifc \op,avg >> > + vle8.v v24, (a0) >> > + vaaddu.vv \dst, \dst, v24 >> > +.endif >> > + >> > +.endm >> > + >> > +.macro epel_load_inc dst len op name type from_mem regtype >> > + epel_load \dst, \len, \op, \name, \type, \from_mem, >> \regtype >> > + add a2, a2, a3 >> > +.endm >> > + >> > +.macro epel len op name type vlen >> > +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x >> > + epel_filter \name, \type, t >> > +.if \vlen < 256 >> > + vsetvlstatic8 \len, a5, 32, m2 >> > +.else >> > + vsetvlstatic8 \len, a5, 64, m2 >> > +.endif >> > +.ifc \op,avg >> > + csrwi vxrm, 0 >> > +.endif >> > + >> > +1: >> > + addi a4, a4, -1 >> > + epel_load v30, \len, \op, \name, \type, 1, t >> > + vse8.v v30, (a0) >> > +.if \len == 64 && \vlen < 256 >> > + addi a0, a0, 32 >> > + addi a2, a2, 32 >> > + epel_load v30, \len, \op, \name, \type, 1, t >> > + vse8.v v30, (a0) >> > + addi a0, a0, -32 >> > + addi a2, a2, -32 >> > +.endif >> > + add 
a2, a2, a3 >> > + add a0, a0, a1 >> > + bnez a4, 1b >> > + >> > + ret >> > +endfunc >> > +.endm >> > + >> > .irp len, 64, 32, 16, 8, 4 >> > copy_avg \len >> > .irp op, put, avg >> > bilin_h_v \len, \op, h, a5 >> > bilin_h_v \len, \op, v, a6 >> > + .irp name, regular, sharp, smooth >> >> AFAICT, regular and sharp are identical, except for the base address of >> the >> filter table, so it should be possible to share the byte code. Similarly, >> it >> should be possible to share most of the horizontal and vertical code >> (maybe >> also for bilinear. not just EPel) with separate load/store then inner >> procedures. The H.263 loop filter already does that though with almost no >> overhead, though >> H.263 is obviously simpler than VP9. >> >> A French philosopher famously said that Perfect is the enemy of Good. >> Generally, as with VVC, nested repetition macros for finely specialised >> functions tend to generate way too much byte code, and this ends up being >> worse rather than better in the big picture. 
>> >> > + .irp type, h, v >> > + epel \len, \op, \name, \type, 128 >> > + epel \len, \op, \name, \type, 256 >> > + .endr >> > + .endr >> > .endr >> > .endr >> > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h >> > index 8fb326dae0..5fd64a1b8c 100644 >> > --- a/libavcodec/riscv/vp9dsp.h >> > +++ b/libavcodec/riscv/vp9dsp.h >> > @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, >> const >> > uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const >> > uint8_t *l, const uint8_t *a); >> > >> > -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) >> >> > \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t >> > dststride, \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, >> > min_vlen) \ +void >> > ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, >> \ + >> > ptrdiff_t dststride, >> >> > \ const uint8_t *src, \ ptrdiff_t srcstride, >> >> > \ int h, int mx, int my); \ \ -void >> > ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, >> \ >> > +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, >> >> > \ + ptrdiff_t dststride, >> >> > \ const uint8_t *src, \ ptrdiff_t srcstride, >> >> > \ int h, int mx, int my); \ \ -void >> > ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, >> \ >> > +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, >> >> > \ + ptrdiff_t dststride, >> >> > \ const uint8_t *src, \ ptrdiff_t srcstride, >> >> > \ int h, int mx, int my); \ \ -void >> > ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, >> \ >> > +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, >> >> > \ + ptrdiff_t dststride, >> >> > \ const uint8_t *src, \ ptrdiff_t srcstride, >> >> > \ int h, int mx, int my); \ \ -void >> > ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, >> \ >> > +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, >> >> > \ + 
ptrdiff_t dststride, >> >> > \ const uint8_t *src, \ ptrdiff_t srcstride, >> >> > \ int h, int mx, int my); \ \ -void >> > ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, >> \ >> > +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, >> >> > \ + ptrdiff_t dststride, >> >> > \ const uint8_t *src, \ ptrdiff_t srcstride, >> >> > \ int h, int mx, int my); >> > @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, >> ptrdiff_t >> > dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int >> > mx, int my); >> > >> > -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR); >> > -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR); >> > -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR); >> > -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR); >> > -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR); >> > - >> > -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP); >> > -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP); >> > -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP); >> > -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP); >> > -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP); >> > - >> > -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH); >> > -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH); >> > -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH); >> > -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH); >> > -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH); >> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128); >> > + >> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, 
FILTER_8TAP_SHARP, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128); >> > + >> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128); >> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128); >> > + >> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256); >> > + >> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256); >> > + >> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256); >> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256); >> > >> > VP9_BILINEAR_RISCV_RVV_FUNC(64); >> > VP9_BILINEAR_RISCV_RVV_FUNC(32); >> > diff --git a/libavcodec/riscv/vp9dsp_init.c >> b/libavcodec/riscv/vp9dsp_init.c >> > index 9606d8545f..314a1e5808 100644 >> > --- a/libavcodec/riscv/vp9dsp_init.c >> > +++ b/libavcodec/riscv/vp9dsp_init.c >> > @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext >> > *dsp, int bpp) # endif 
>> > >> > #if HAVE_RVV >> > - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && >> ff_rv_vlen_least(128)) >> > { + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) { >> > + if (ff_rv_vlen_least(128)) { >> > >> > #define init_fpel(idx1, sz) \ >> > dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = >> ff_vp9_avg##sz##_rvv; \ >> > @@ -85,7 +86,42 @@ static av_cold void >> vp9dsp_mc_init_riscv(VP9DSPContext >> > *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][0] = >> > ff_avg_vp9_bilin_4h_rvv; >> > >> > #undef init_fpel >> > + >> > +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \ >> > + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \ >> > + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \ >> > + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \ >> > + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \ >> > + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \ >> > + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen; >> > + >> > +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \ >> > + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \ >> > + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \ >> > + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \ >> > + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \ >> > + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen) >> > + >> > + init_subpel2(0, 1, 0, h, put, 128); >> > + init_subpel2(1, 1, 0, h, avg, 128); >> > + >> > + if (flags & AV_CPU_FLAG_RVB_ADDR) { >> > + init_subpel2(0, 0, 1, v, put, 128); >> > + init_subpel2(1, 0, 1, v, avg, 128); >> > + } >> > + >> > + } >> > + if (ff_rv_vlen_least(256)) { >> > + init_subpel2(0, 1, 0, h, put, 256); >> > + init_subpel2(1, 1, 0, h, avg, 256); >> > + >> > + if (flags & AV_CPU_FLAG_RVB_ADDR) { >> > + init_subpel2(0, 0, 1, v, put, 256); >> > + init_subpel2(1, 0, 1, v, avg, 256); >> > + } >> > } >> > + } >> > + >> > #endif >> > #endif >> > } >> >> >> -- >> Rémi Denis-Courmont >> http://www.remlab.net/ >> >> >> >> 
_______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >> > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v 2024-05-25 10:38 ` flow gg 2024-05-25 10:47 ` flow gg @ 2024-05-25 17:36 ` Rémi Denis-Courmont 1 sibling, 0 replies; 11+ messages in thread From: Rémi Denis-Courmont @ 2024-05-25 17:36 UTC (permalink / raw) To: FFmpeg development discussions and patches Le lauantaina 25. toukokuuta 2024, 13.38.39 EEST flow gg a écrit : > > Is there a reason that you cannot use the tables from C code? > > Similar to VP8, to adjust the positive and negative data and prevent small > probability overflow during calculations. > > > AFAICT, regular and sharp are identical, except for the base address of > > the filter table, so it should be possible to share the byte code > > Initially, they used the same code, but after testing hundreds of times, > there were always a few failures... AFAICT, the C reference and the AArch64 assembler are exactly the same for all 3 filters, except for the offset in the filter table. So logically, it ought to be possible to merge regular and sharp with almost no changes, and merge smooth with a few fixes. As for the tables themselves, it seems the "problem" is that C uses 16-bit values for no apparent reason. We should probably change the C code to use 8-bit on all platforms - except AArch64, because the NEON code probably relies on 16-bit format. -- Rémi Denis-Courmont http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 11+ messages in thread
* [FFmpeg-devel] [PATCH v2 4/5] lavc/vp9dsp: R-V V mc bilin hv [not found] <20240521171319.2629938-1-uk7b@foxmail.com> 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 2/5] lavc/vp9dsp: R-V V mc bilin h v uk7b 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap " uk7b @ 2024-05-21 17:13 ` uk7b 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 5/5] lavc/vp9dsp: R-V V mc tap hv uk7b 3 siblings, 0 replies; 11+ messages in thread From: uk7b @ 2024-05-21 17:13 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_avg_bilin_4hv_8bpp_c: 11.0 vp9_avg_bilin_4hv_8bpp_rvv_i64: 3.7 vp9_avg_bilin_8hv_8bpp_c: 38.7 vp9_avg_bilin_8hv_8bpp_rvv_i64: 7.2 vp9_avg_bilin_16hv_8bpp_c: 147.0 vp9_avg_bilin_16hv_8bpp_rvv_i64: 14.2 vp9_avg_bilin_32hv_8bpp_c: 574.5 vp9_avg_bilin_32hv_8bpp_rvv_i64: 42.7 vp9_avg_bilin_64hv_8bpp_c: 2311.5 vp9_avg_bilin_64hv_8bpp_rvv_i64: 201.7 vp9_put_bilin_4hv_8bpp_c: 10.0 vp9_put_bilin_4hv_8bpp_rvv_i64: 3.2 vp9_put_bilin_8hv_8bpp_c: 35.2 vp9_put_bilin_8hv_8bpp_rvv_i64: 6.5 vp9_put_bilin_16hv_8bpp_c: 133.7 vp9_put_bilin_16hv_8bpp_rvv_i64: 13.0 vp9_put_bilin_32hv_8bpp_c: 538.2 vp9_put_bilin_32hv_8bpp_rvv_i64: 39.7 vp9_put_bilin_64hv_8bpp_c: 2114.0 vp9_put_bilin_64hv_8bpp_rvv_i64: 153.7 --- libavcodec/riscv/vp9_mc_rvv.S | 34 ++++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp_init.c | 10 ++++++++++ 2 files changed, 44 insertions(+) diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S index adba4afb90..d7db775df7 100644 --- a/libavcodec/riscv/vp9_mc_rvv.S +++ b/libavcodec/riscv/vp9_mc_rvv.S @@ -104,6 +104,39 @@ func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x endfunc .endm +.macro bilin_hv len op +func ff_\op\()_vp9_bilin_\len\()hv_rvv, zve32x +.ifc \op,avg + csrwi vxrm, 0 +.endif + vsetvlstatic8 \len, t0, 64 + neg t1, a5 + neg t2, a6 + li t4, 8 + bilin_load v24, \len, put, h, a5 + add a2, a2, a3 +1: + addi a4, a4, -1 + bilin_load v4, \len, put, h, a5 + vwmulu.vx v16, 
v4, a6 + vwmaccsu.vx v16, t2, v24 + vwadd.wx v16, v16, t4 + vnsra.wi v16, v16, 4 + vadd.vv v0, v16, v24 +.ifc \op,avg + vle8.v v16, (a0) + vaaddu.vv v0, v0, v16 +.endif + vse8.v v0, (a0) + vmv.v.v v24, v4 + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc +.endm + const subpel_filters_regular .byte 0, 0, 0, 128, 0, 0, 0, 0 .byte 0, 1, -5, 126, 8, -3, 1, 0 @@ -334,6 +367,7 @@ endfunc .irp op, put, avg bilin_h_v \len, \op, h, a5 bilin_h_v \len, \op, v, a6 + bilin_hv \len, \op .irp name, regular, sharp, smooth .irp type, h, v epel \len, \op, \name, \type, 128 diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index 314a1e5808..be5369d506 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -84,6 +84,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv; dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv; dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv; + dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv; + dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv; + dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv; + dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv; + dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv; + dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv; + dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv; + dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv; + dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv; + dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv; #undef init_fpel -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject 
"unsubscribe". ^ permalink raw reply [flat|nested] 11+ messages in thread
* [FFmpeg-devel] [PATCH v2 5/5] lavc/vp9dsp: R-V V mc tap hv [not found] <20240521171319.2629938-1-uk7b@foxmail.com> ` (2 preceding siblings ...) 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 4/5] lavc/vp9dsp: R-V V mc bilin hv uk7b @ 2024-05-21 17:13 ` uk7b 3 siblings, 0 replies; 11+ messages in thread From: uk7b @ 2024-05-21 17:13 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908 X60 vp9_avg_8tap_smooth_4hv_8bpp_c : 32.0 28.2 vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32 : 15.0 13.2 vp9_avg_8tap_smooth_8hv_8bpp_c : 98.0 86.2 vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32 : 23.7 21.0 vp9_avg_8tap_smooth_16hv_8bpp_c : 355.5 297.0 vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32 : 62.7 41.2 vp9_avg_8tap_smooth_32hv_8bpp_c : 1273.0 1099.7 vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32 : 133.7 119.2 vp9_avg_8tap_smooth_64hv_8bpp_c : 4933.0 4240.5 vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32 : 506.7 227.0 vp9_put_8tap_smooth_4hv_8bpp_c : 30.2 27.0 vp9_put_8tap_smooth_4hv_8bpp_rvv_i32 : 14.5 12.7 vp9_put_8tap_smooth_8hv_8bpp_c : 91.2 81.2 vp9_put_8tap_smooth_8hv_8bpp_rvv_i32 : 22.7 20.2 vp9_put_8tap_smooth_16hv_8bpp_c : 329.2 277.7 vp9_put_8tap_smooth_16hv_8bpp_rvv_i32 : 44.7 40.0 vp9_put_8tap_smooth_32hv_8bpp_c : 1183.7 1022.7 vp9_put_8tap_smooth_32hv_8bpp_rvv_i32 : 130.7 116.5 vp9_put_8tap_smooth_64hv_8bpp_c : 4502.7 3954.5 vp9_put_8tap_smooth_64hv_8bpp_rvv_i32 : 496.0 224.7 --- libavcodec/riscv/vp9_mc_rvv.S | 75 ++++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp_init.c | 8 ++++ 2 files changed, 83 insertions(+) diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S index d7db775df7..06c79b16f7 100644 --- a/libavcodec/riscv/vp9_mc_rvv.S +++ b/libavcodec/riscv/vp9_mc_rvv.S @@ -362,6 +362,77 @@ func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x endfunc .endm +#if __riscv_xlen == 64 +.macro epel_hv_once len name op + sub a2, a2, a3 + sub a2, a2, a3 + sub a2, a2, a3 + .irp n,0,2,4,6,8,10,12,14 + epel_load_inc v\n, \len, put, 
\name, h, 1, t + .endr + addi a4, a4, -1 +1: + addi a4, a4, -1 + epel_load v30, \len, \op, \name, v, 0, s + vse8.v v30, (a0) + vmv.v.v v0, v2 + vmv.v.v v2, v4 + vmv.v.v v4, v6 + vmv.v.v v6, v8 + vmv.v.v v8, v10 + vmv.v.v v10, v12 + vmv.v.v v12, v14 + epel_load v14, \len, put, \name, h, 1, t + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + epel_load v30, \len, \op, \name, v, 0, s + vse8.v v30, (a0) +.endm + +.macro epel_hv op name len vlen +func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x + addi sp, sp, -64 + .irp n,0,1,2,3,4,5,6,7 + sd s\n, \n\()<<3(sp) + .endr +.if \len == 64 && \vlen < 256 + addi sp, sp, -48 + .irp n,0,1,2,3,4,5 + sd a\n, \n\()<<3(sp) + .endr +.endif +.ifc \op,avg + csrwi vxrm, 0 +.endif + epel_filter \name, h, t + epel_filter \name, v, s +.if \vlen < 256 + vsetvlstatic8 \len, a6, 32, m2 +.else + vsetvlstatic8 \len, a6, 64, m2 +.endif + epel_hv_once \len, \name, \op +.if \len == 64 && \vlen < 256 + .irp n,0,1,2,3,4,5 + ld a\n, \n\()<<3(sp) + .endr + addi sp, sp, 48 + addi a0, a0, 32 + addi a2, a2, 32 + epel_filter \name, h, t + epel_hv_once \len, \name, \op +.endif + .irp n,0,1,2,3,4,5,6,7 + ld s\n, \n\()<<3(sp) + .endr + addi sp, sp, 64 + + ret +endfunc +.endm +#endif + .irp len, 64, 32, 16, 8, 4 copy_avg \len .irp op, put, avg @@ -373,6 +444,10 @@ endfunc epel \len, \op, \name, \type, 128 epel \len, \op, \name, \type, 256 .endr + #if __riscv_xlen == 64 + epel_hv \op, \name, \len, 128 + epel_hv \op, \name, \len, 256 + #endif .endr .endr .endr diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index be5369d506..887dba461f 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -118,6 +118,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) if (flags & AV_CPU_FLAG_RVB_ADDR) { init_subpel2(0, 0, 1, v, put, 128); init_subpel2(1, 0, 1, v, avg, 128); +# if __riscv_xlen == 64 + init_subpel2(0, 1, 1, hv, put, 128); + init_subpel2(1, 1, 1, hv, avg, 128); +# endif 
} } @@ -128,6 +132,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) if (flags & AV_CPU_FLAG_RVB_ADDR) { init_subpel2(0, 0, 1, v, put, 256); init_subpel2(1, 0, 1, v, avg, 256); +# if __riscv_xlen == 64 + init_subpel2(0, 1, 1, hv, put, 256); + init_subpel2(1, 1, 1, hv, avg, 256); +# endif } } } -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2024-05-26 13:26 UTC | newest] Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- [not found] <20240521171319.2629938-1-uk7b@foxmail.com> 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 2/5] lavc/vp9dsp: R-V V mc bilin h v uk7b 2024-05-21 18:29 ` Rémi Denis-Courmont 2024-05-21 18:36 ` flow gg 2024-05-26 13:26 ` Rémi Denis-Courmont 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap " uk7b 2024-05-25 10:17 ` Rémi Denis-Courmont 2024-05-25 10:38 ` flow gg 2024-05-25 10:47 ` flow gg 2024-05-25 17:36 ` Rémi Denis-Courmont 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 4/5] lavc/vp9dsp: R-V V mc bilin hv uk7b 2024-05-21 17:13 ` [FFmpeg-devel] [PATCH v2 5/5] lavc/vp9dsp: R-V V mc tap hv uk7b
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git