* [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels @ 2024-05-04 10:01 uk7b 2024-05-04 10:08 ` flow gg 2024-05-04 17:53 ` Rémi Denis-Courmont 0 siblings, 2 replies; 16+ messages in thread From: uk7b @ 2024-05-04 10:01 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7 vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7 --- libavcodec/riscv/vc1dsp_init.c | 8 +++++ libavcodec/riscv/vc1dsp_rvv.S | 66 ++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c index e47b644f80..610c43a1a3 100644 --- a/libavcodec/riscv/vc1dsp_init.c +++ b/libavcodec/riscv/vc1dsp_init.c @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) { @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { dsp->vc1_inv_trans_4x8_dc = 
ff_vc1_inv_trans_4x8_dc_rvv; dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv; + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; if (flags & AV_CPU_FLAG_RVV_I64) { dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv; + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; } } #endif diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S index 4a00945ead..48244f91aa 100644 --- a/libavcodec/riscv/vc1dsp_rvv.S +++ b/libavcodec/riscv/vc1dsp_rvv.S @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x vsse32.v v0, (a0), a1 ret endfunc + +func ff_put_pixels16x16_rvv, zve32x + vsetivli zero, 16, e8, m1, ta, ma + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + vle8.v v\n, (a1) + add a1, a1, a2 + .endr + vle8.v v31, (a1) + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + vse8.v v\n, (a0) + add a0, a0, a2 + .endr + vse8.v v31, (a0) + + ret +endfunc + +func ff_put_pixels8x8_rvv, zve64x + vsetivli zero, 8, e8, mf2, ta, ma + vlse64.v v8, (a1), a2 + vsse64.v v8, (a0), a2 + + ret +endfunc + +func ff_avg_pixels16x16_rvv, zve32x + csrwi vxrm, 0 + vsetivli zero, 16, e8, m1, ta, ma + li t0, 128 + + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + vle8.v v\n, (a1) + add a1, a1, a2 + .endr + vle8.v v31, (a1) + .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + vle8.v v\n, (a0) + add a0, a0, a2 + .endr + vle8.v v15, (a0) + vsetvli zero, t0, e8, m8, ta, ma + vaaddu.vv v0, v0, v16 + vaaddu.vv v8, v8, v24 + vsetivli zero, 16, e8, m1, ta, ma + .irp n 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 + vse8.v v\n, (a0) + sub a0, a0, a2 + .endr + vse8.v v0, (a0) + + ret +endfunc + +func ff_avg_pixels8x8_rvv, zve64x + csrwi vxrm, 0 + li t0, 64 + vsetivli zero, 8, e8, mf2, ta, ma + vlse64.v 
v16, (a1), a2 + vlse64.v v8, (a0), a2 + vsetvli zero, t0, e8, m4, ta, ma + vaaddu.vv v16, v16, v8 + vsetivli zero, 8, e8, mf2, ta, ma + vsse64.v v16, (a0), a2 + + ret +endfunc -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-04 10:01 [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels uk7b @ 2024-05-04 10:08 ` flow gg 2024-05-04 17:53 ` Rémi Denis-Courmont 1 sibling, 0 replies; 16+ messages in thread From: flow gg @ 2024-05-04 10:08 UTC (permalink / raw) To: FFmpeg development discussions and patches Hi, it's me. I accidentally repeated it but it seems to be correct. <uk7b@foxmail.com> 于2024年5月4日周六 18:01写道: > From: sunyuechi <sunyuechi@iscas.ac.cn> > > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7 > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7 > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5 > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2 > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7 > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0 > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5 > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7 > --- > libavcodec/riscv/vc1dsp_init.c | 8 +++++ > libavcodec/riscv/vc1dsp_rvv.S | 66 ++++++++++++++++++++++++++++++++++ > 2 files changed, 74 insertions(+) > > diff --git a/libavcodec/riscv/vc1dsp_init.c > b/libavcodec/riscv/vc1dsp_init.c > index e47b644f80..610c43a1a3 100644 > --- a/libavcodec/riscv/vc1dsp_init.c > +++ b/libavcodec/riscv/vc1dsp_init.c > @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, > ptrdiff_t stride, int16_t *block > void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t > *block); > void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t > *block); > void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t > *block); > +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > line_size, int rnd); > +void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > line_size, int rnd); > +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > line_size, int rnd); > +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, 
ptrdiff_t > line_size, int rnd); > > av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > { > @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { > dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; > dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; > + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv; > + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; > if (flags & AV_CPU_FLAG_RVV_I64) { > dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; > dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; > + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv; > + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; > } > } > #endif > diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S > index 4a00945ead..48244f91aa 100644 > --- a/libavcodec/riscv/vc1dsp_rvv.S > +++ b/libavcodec/riscv/vc1dsp_rvv.S > @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x > vsse32.v v0, (a0), a1 > ret > endfunc > + > +func ff_put_pixels16x16_rvv, zve32x > + vsetivli zero, 16, e8, m1, ta, ma > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vle8.v v\n, (a1) > + add a1, a1, a2 > + .endr > + vle8.v v31, (a1) > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vse8.v v\n, (a0) > + add a0, a0, a2 > + .endr > + vse8.v v31, (a0) > + > + ret > +endfunc > + > +func ff_put_pixels8x8_rvv, zve64x > + vsetivli zero, 8, e8, mf2, ta, ma > + vlse64.v v8, (a1), a2 > + vsse64.v v8, (a0), a2 > + > + ret > +endfunc > + > +func ff_avg_pixels16x16_rvv, zve32x > + csrwi vxrm, 0 > + vsetivli zero, 16, e8, m1, ta, ma > + li t0, 128 > + > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vle8.v v\n, (a1) > + add a1, a1, a2 > + .endr > + vle8.v v31, (a1) > + .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 > + vle8.v v\n, (a0) > + add a0, a0, a2 > + .endr > + vle8.v 
v15, (a0) > + vsetvli zero, t0, e8, m8, ta, ma > + vaaddu.vv v0, v0, v16 > + vaaddu.vv v8, v8, v24 > + vsetivli zero, 16, e8, m1, ta, ma > + .irp n 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 > + vse8.v v\n, (a0) > + sub a0, a0, a2 > + .endr > + vse8.v v0, (a0) > + > + ret > +endfunc > + > +func ff_avg_pixels8x8_rvv, zve64x > + csrwi vxrm, 0 > + li t0, 64 > + vsetivli zero, 8, e8, mf2, ta, ma > + vlse64.v v16, (a1), a2 > + vlse64.v v8, (a0), a2 > + vsetvli zero, t0, e8, m4, ta, ma > + vaaddu.vv v16, v16, v8 > + vsetivli zero, 8, e8, mf2, ta, ma > + vsse64.v v16, (a0), a2 > + > + ret > +endfunc > -- > 2.45.0 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-04 10:01 [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels uk7b 2024-05-04 10:08 ` flow gg @ 2024-05-04 17:53 ` Rémi Denis-Courmont 2024-05-05 9:15 ` uk7b 2024-05-05 9:18 ` flow gg 1 sibling, 2 replies; 16+ messages in thread From: Rémi Denis-Courmont @ 2024-05-04 17:53 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi Le lauantaina 4. toukokuuta 2024, 13.01.05 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7 > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7 > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5 > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2 > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7 > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0 > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5 > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7 > --- > libavcodec/riscv/vc1dsp_init.c | 8 +++++ > libavcodec/riscv/vc1dsp_rvv.S | 66 ++++++++++++++++++++++++++++++++++ > 2 files changed, 74 insertions(+) > > diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c > index e47b644f80..610c43a1a3 100644 > --- a/libavcodec/riscv/vc1dsp_init.c > +++ b/libavcodec/riscv/vc1dsp_init.c > @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t > stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, > ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t > *dest, ptrdiff_t stride, int16_t *block); void > ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t > *block); +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, > ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvv(uint8_t *dst, > const uint8_t *src, ptrdiff_t line_size, int rnd); +void > ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t > *src, 
ptrdiff_t line_size, int rnd); > > av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > { > @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { > dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; > dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; > + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv; > + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; > if (flags & AV_CPU_FLAG_RVV_I64) { > dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; > dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; > + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv; > + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; > } > } > #endif > diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S > index 4a00945ead..48244f91aa 100644 > --- a/libavcodec/riscv/vc1dsp_rvv.S > +++ b/libavcodec/riscv/vc1dsp_rvv.S > @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x > vsse32.v v0, (a0), a1 > ret > endfunc > + > +func ff_put_pixels16x16_rvv, zve32x > + vsetivli zero, 16, e8, m1, ta, ma > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vle8.v v\n, (a1) > + add a1, a1, a2 > + .endr > + vle8.v v31, (a1) Is it not faster to compute the address ahead of time, e.g.: add t1, a2, a1 vle8.v vN, (a1) sh1add a1, a2, a1 vle8.v vN+1, (t1) ...and so on? Even on a reordering core, you can't eliminate stall on data dependency if there is nothing else to be done. (Ditto below and in other patches.) > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vse8.v v\n, (a0) > + add a0, a0, a2 > + .endr > + vse8.v v31, (a0) > + > + ret > +endfunc > + > +func ff_put_pixels8x8_rvv, zve64x > + vsetivli zero, 8, e8, mf2, ta, ma > + vlse64.v v8, (a1), a2 > + vsse64.v v8, (a0), a2 Copying 64-bit quantities should not need RVV at all. 
Maybe the C version needs to be improved instead, but if that is not possible, then an RVI version may be more portable and work just as well. > + > + ret > +endfunc > + > +func ff_avg_pixels16x16_rvv, zve32x > + csrwi vxrm, 0 > + vsetivli zero, 16, e8, m1, ta, ma > + li t0, 128 > + > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vle8.v v\n, (a1) > + add a1, a1, a2 > + .endr > + vle8.v v31, (a1) > + .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 > + vle8.v v\n, (a0) > + add a0, a0, a2 > + .endr > + vle8.v v15, (a0) > + vsetvli zero, t0, e8, m8, ta, ma > + vaaddu.vv v0, v0, v16 > + vaaddu.vv v8, v8, v24 > + vsetivli zero, 16, e8, m1, ta, ma > + .irp n 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 > + vse8.v v\n, (a0) > + sub a0, a0, a2 > + .endr > + vse8.v v0, (a0) > + > + ret > +endfunc > + > +func ff_avg_pixels8x8_rvv, zve64x > + csrwi vxrm, 0 > + li t0, 64 > + vsetivli zero, 8, e8, mf2, ta, ma Does MF2 actually improve perfs over M1 here? > + vlse64.v v16, (a1), a2 > + vlse64.v v8, (a0), a2 > + vsetvli zero, t0, e8, m4, ta, ma > + vaaddu.vv v16, v16, v8 > + vsetivli zero, 8, e8, mf2, ta, ma > + vsse64.v v16, (a0), a2 > + > + ret > +endfunc -- レミ・デニ-クールモン http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-04 17:53 ` Rémi Denis-Courmont @ 2024-05-05 9:15 ` uk7b 2024-05-05 9:18 ` flow gg 1 sibling, 0 replies; 16+ messages in thread From: uk7b @ 2024-05-05 9:15 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 875.2 vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 141.7 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 226.5 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 529.5 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 79.5 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 144.5 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi: 26.7 --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/vc1dsp_init.c | 16 +++++++++- libavcodec/riscv/vc1dsp_rvi.S | 37 ++++++++++++++++++++++ libavcodec/riscv/vc1dsp_rvv.S | 57 ++++++++++++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 libavcodec/riscv/vc1dsp_rvi.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index 43b5c21cf4..cd5cc21cfd 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -59,6 +59,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o +RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c index e47b644f80..bddc4b65eb 100644 --- a/libavcodec/riscv/vc1dsp_init.c +++ b/libavcodec/riscv/vc1dsp_init.c @@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); void 
ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) { -#if HAVE_RVV +#if HAVE_RV int flags = av_get_cpu_flags(); +# if __riscv_xlen >= 64 + if (flags & AV_CPU_FLAG_RVI) { + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi; + } +# endif +#if HAVE_RVV if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv; + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; if (flags & AV_CPU_FLAG_RVV_I64) { dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; } } #endif +#endif } diff --git a/libavcodec/riscv/vc1dsp_rvi.S b/libavcodec/riscv/vc1dsp_rvi.S new file mode 100644 index 0000000000..bee8e52f2e --- /dev/null +++ b/libavcodec/riscv/vc1dsp_rvi.S @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +#if __riscv_xlen >= 64 +func ff_put_pixels8x8_rvi +.rept 7 + ld t0, (a1) + sd t0, (a0) + add a1, a1, a2 + add a0, a0, a2 +.endr + ld t0, (a1) + sd t0, (a0) + + ret +endfunc +#endif + diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S index 4a00945ead..08c93476de 100644 --- a/libavcodec/riscv/vc1dsp_rvv.S +++ b/libavcodec/riscv/vc1dsp_rvv.S @@ -111,3 +111,60 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x vsse32.v v0, (a0), a1 ret endfunc + +.macro mspel_op op pos n1 n2 + add t1, \pos, a2 + v\op\()e8.v v\n1, (\pos) + sh1add \pos, a2, \pos + v\op\()e8.v v\n2, (t1) +.endm + +.macro mspel_op_all op pos a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16 + mspel_op \op \pos \a1 \a2 + mspel_op \op \pos \a3 \a4 + mspel_op \op \pos \a5 \a6 + mspel_op \op \pos \a7 \a8 + mspel_op \op \pos \a9 \a10 + mspel_op \op \pos \a11 \a12 + mspel_op \op \pos \a13 \a14 + mspel_op \op \pos \a15 \a16 +.endm + +func ff_put_pixels16x16_rvv, zve32x + vsetivli zero, 16, e8, m1, ta, ma + mspel_op_all l a1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + mspel_op_all s a0 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + + ret +endfunc + +func ff_avg_pixels16x16_rvv, zve32x + csrwi vxrm, 0 + vsetivli zero, 16, e8, m1, ta, ma + li t0, 128 + mspel_op_all l a1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + mspel_op_all l a0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + vsetvli zero, t0, e8, m8, ta, ma + sub a0, a0, a2 + vaaddu.vv v0, v0, v16 + neg a2, a2 + vaaddu.vv v8, 
v8, v24 + vsetivli zero, 16, e8, m1, ta, ma + mspel_op_all s a0 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + + ret +endfunc + +func ff_avg_pixels8x8_rvv, zve64x + csrwi vxrm, 0 + li t0, 64 + vsetivli zero, 8, e8, mf2, ta, ma + vlse64.v v16, (a1), a2 + vlse64.v v8, (a0), a2 + vsetvli zero, t0, e8, m4, ta, ma + vaaddu.vv v16, v16, v8 + vsetivli zero, 8, e8, mf2, ta, ma + vsse64.v v16, (a0), a2 + + ret +endfunc -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-04 17:53 ` Rémi Denis-Courmont 2024-05-05 9:15 ` uk7b @ 2024-05-05 9:18 ` flow gg 2024-05-05 19:26 ` Rémi Denis-Courmont 1 sibling, 1 reply; 16+ messages in thread From: flow gg @ 2024-05-05 9:18 UTC (permalink / raw) To: FFmpeg development discussions and patches > Is it not faster to compute the address ahead of time, e.g.: > Ditto below and in other patches. Yes, update here and I will check other patches > Copying 64-bit quantities should not need RVV at all. Maybe the C version needs to be improved instead, but if that is not possible, then an RVI version may be more portable and work just as well. The logic in the c version is the same in other places, which might be difficult to modify. I've updated it using rvi. > Does MF2 actually improve perfs over M1 here? The difference here seems very small, but when both mf2 and m1 are correct, the test results have only shown mf2 to be better, so I want to use mf2. Rémi Denis-Courmont <remi@remlab.net> 于2024年5月5日周日 01:53写道: > Le lauantaina 4. 
toukokuuta 2024, 13.01.05 EEST uk7b@foxmail.com a écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7 > > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7 > > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5 > > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2 > > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7 > > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0 > > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5 > > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7 > > --- > > libavcodec/riscv/vc1dsp_init.c | 8 +++++ > > libavcodec/riscv/vc1dsp_rvv.S | 66 ++++++++++++++++++++++++++++++++++ > > 2 files changed, 74 insertions(+) > > > > diff --git a/libavcodec/riscv/vc1dsp_init.c > b/libavcodec/riscv/vc1dsp_init.c > > index e47b644f80..610c43a1a3 100644 > > --- a/libavcodec/riscv/vc1dsp_init.c > > +++ b/libavcodec/riscv/vc1dsp_init.c > > @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, > ptrdiff_t > > stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, > > ptrdiff_t stride, int16_t *block); void > ff_vc1_inv_trans_8x4_dc_rvv(uint8_t > > *dest, ptrdiff_t stride, int16_t *block); void > > ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t > > *block); +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, > > ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvv(uint8_t *dst, > > const uint8_t *src, ptrdiff_t line_size, int rnd); +void > > ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > > line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const > uint8_t > > *src, ptrdiff_t line_size, int rnd); > > > > av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > > { > > @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > > if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { > > dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; > > dsp->vc1_inv_trans_4x4_dc = 
ff_vc1_inv_trans_4x4_dc_rvv; > > + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv; > > + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; > > if (flags & AV_CPU_FLAG_RVV_I64) { > > dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; > > dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; > > + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv; > > + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; > > } > > } > > #endif > > diff --git a/libavcodec/riscv/vc1dsp_rvv.S > b/libavcodec/riscv/vc1dsp_rvv.S > > index 4a00945ead..48244f91aa 100644 > > --- a/libavcodec/riscv/vc1dsp_rvv.S > > +++ b/libavcodec/riscv/vc1dsp_rvv.S > > @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x > > vsse32.v v0, (a0), a1 > > ret > > endfunc > > + > > +func ff_put_pixels16x16_rvv, zve32x > > + vsetivli zero, 16, e8, m1, ta, ma > > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, > 30 > > + vle8.v v\n, (a1) > > + add a1, a1, a2 > > + .endr > > + vle8.v v31, (a1) > > Is it not faster to compute the address ahead of time, e.g.: > > add t1, a2, a1 > vle8.v vN, (a1) > sh1add a1, a2, a1 > vle8.v vN+1, (t1) > > ...and so on? Even on a reordering core, you can't eliminate stall on data > dependency if there is nothing else to be done. > > (Ditto below and in other patches.) > > > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, > 30 > > + vse8.v v\n, (a0) > > + add a0, a0, a2 > > + .endr > > + vse8.v v31, (a0) > > + > > + ret > > +endfunc > > + > > +func ff_put_pixels8x8_rvv, zve64x > > + vsetivli zero, 8, e8, mf2, ta, ma > > + vlse64.v v8, (a1), a2 > > + vsse64.v v8, (a0), a2 > > Copying 64-bit quantities should not need RVV at all. Maybe the C version > needs to be improved instead, but if that is not possible, then an RVI > version > may be more portable and work just as well. 
> > > + > > + ret > > +endfunc > > + > > +func ff_avg_pixels16x16_rvv, zve32x > > + csrwi vxrm, 0 > > + vsetivli zero, 16, e8, m1, ta, ma > > + li t0, 128 > > + > > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, > 30 > > + vle8.v v\n, (a1) > > + add a1, a1, a2 > > + .endr > > + vle8.v v31, (a1) > > + .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 > > + vle8.v v\n, (a0) > > + add a0, a0, a2 > > + .endr > > + vle8.v v15, (a0) > > + vsetvli zero, t0, e8, m8, ta, ma > > + vaaddu.vv v0, v0, v16 > > + vaaddu.vv v8, v8, v24 > > + vsetivli zero, 16, e8, m1, ta, ma > > + .irp n 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 > > + vse8.v v\n, (a0) > > + sub a0, a0, a2 > > + .endr > > + vse8.v v0, (a0) > > + > > + ret > > +endfunc > > + > > +func ff_avg_pixels8x8_rvv, zve64x > > + csrwi vxrm, 0 > > + li t0, 64 > > + vsetivli zero, 8, e8, mf2, ta, ma > > Does MF2 actually improve perfs over M1 here? > > > + vlse64.v v16, (a1), a2 > > + vlse64.v v8, (a0), a2 > > + vsetvli zero, t0, e8, m4, ta, ma > > + vaaddu.vv v16, v16, v8 > > + vsetivli zero, 8, e8, mf2, ta, ma > > + vsse64.v v16, (a0), a2 > > + > > + ret > > +endfunc > > > -- > レミ・デニ-クールモン > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-05 9:18 ` flow gg @ 2024-05-05 19:26 ` Rémi Denis-Courmont 2024-05-10 8:21 ` uk7b 2024-05-10 8:22 ` flow gg 0 siblings, 2 replies; 16+ messages in thread From: Rémi Denis-Courmont @ 2024-05-05 19:26 UTC (permalink / raw) To: FFmpeg development discussions and patches Le sunnuntaina 5. toukokuuta 2024, 12.18.56 EEST flow gg a écrit : > > Does MF2 actually improve perfs over M1 here? > > The difference here seems very small, but when both mf2 and m1 are correct, > the test results have only shown mf2 to be better, so I want to use mf2. I can live with that. But this is a slippery slope because large vector sizes would involve even smaller fractions. Then we would need to compute the value, which might negate the performance gains from fractional multipliers. The fastest approach that I can think of is a symbolic LA (which expands to 1xAUIPC + 1xLA) to load a precomputed VTYPE value from a static variable. Furthermore, this requires VSETVL, which precludes immediate constant VL. Indeed, the VSETIVL instruction does not exist. AFAIU, BananaPi F3 has 256-bit vectors already now. -- Rémi Denis-Courmont http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-05 19:26 ` Rémi Denis-Courmont @ 2024-05-10 8:21 ` uk7b 2024-05-12 11:48 ` Rémi Denis-Courmont 2024-05-10 8:22 ` flow gg 1 sibling, 1 reply; 16+ messages in thread From: uk7b @ 2024-05-10 8:21 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908 X60 vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c : 14.7 13.2 vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32 : 2.5 2.2 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c : 3.7 3.5 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64 : 1.0 1.2 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c : 9.0 8.0 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvi : 1.0 1.0 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c : 2.5 2.2 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi : 0.5 0.5 --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/vc1dsp_init.c | 16 +++++++++++- libavcodec/riscv/vc1dsp_rvi.S | 48 ++++++++++++++++++++++++++++++++++ libavcodec/riscv/vc1dsp_rvv.S | 48 ++++++++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 libavcodec/riscv/vc1dsp_rvi.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index 43b5c21cf4..cd5cc21cfd 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -59,6 +59,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o +RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c index e47b644f80..555aa5aea7 100644 --- a/libavcodec/riscv/vc1dsp_init.c +++ b/libavcodec/riscv/vc1dsp_init.c @@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block void 
ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) { -#if HAVE_RVV +#if HAVE_RV int flags = av_get_cpu_flags(); +# if __riscv_xlen >= 64 + if (flags & AV_CPU_FLAG_RVI) { + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi; + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi; + } +# endif +#if HAVE_RVV if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; if (flags & AV_CPU_FLAG_RVV_I64) { dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; } } #endif +#endif } diff --git a/libavcodec/riscv/vc1dsp_rvi.S b/libavcodec/riscv/vc1dsp_rvi.S new file mode 100644 index 0000000000..1d5660316f --- /dev/null +++ b/libavcodec/riscv/vc1dsp_rvi.S @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +#if __riscv_xlen >= 64 +func ff_put_pixels8x8_rvi +.rept 8 + ld t0, (a1) + sd t0, (a0) + add a1, a1, a2 + add a0, a0, a2 +.endr + + ret +endfunc + +func ff_put_pixels16x16_rvi +.rept 16 + ld t0, (a1) + ld t1, 8(a1) + sd t0, (a0) + sd t1, 8(a0) + add a1, a1, a2 + add a0, a0, a2 +.endr + + ret +endfunc +#endif + diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S index 4a00945ead..7c2b47f66c 100644 --- a/libavcodec/riscv/vc1dsp_rvv.S +++ b/libavcodec/riscv/vc1dsp_rvv.S @@ -111,3 +111,51 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x vsse32.v v0, (a0), a1 ret endfunc + +.macro mspel_op op pos n1 n2 + add t1, \pos, a2 + v\op\()e8.v v\n1, (\pos) + sh1add \pos, a2, \pos + v\op\()e8.v v\n2, (t1) +.endm + +.macro mspel_op_all op pos a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16 + mspel_op \op \pos \a1 \a2 + mspel_op \op \pos \a3 \a4 + mspel_op \op \pos \a5 \a6 + mspel_op \op \pos \a7 \a8 + mspel_op \op \pos \a9 \a10 + mspel_op \op \pos \a11 \a12 + mspel_op \op \pos \a13 \a14 + mspel_op \op \pos \a15 \a16 +.endm + +func ff_avg_pixels16x16_rvv, zve32x + csrwi vxrm, 0 + vsetivli zero, 16, e8, m1, ta, ma + mspel_op_all l a1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + mspel_op_all l a0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + vsetvli t0, zero, e8, m8, ta, ma + sub a0, a0, a2 + vaaddu.vv v0, v0, v16 + neg a2, a2 + vaaddu.vv v8, v8, v24 + vsetivli zero, 16, e8, m1, ta, ma + mspel_op_all s a0 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + + ret 
+endfunc + +func ff_avg_pixels8x8_rvv, zve64x + csrwi vxrm, 0 + li t0, 64 + vsetivli zero, 8, e8, mf2, ta, ma + vlse64.v v16, (a1), a2 + vlse64.v v8, (a0), a2 + vsetvli zero, t0, e8, m4, ta, ma + vaaddu.vv v16, v16, v8 + vsetivli zero, 8, e8, mf2, ta, ma + vsse64.v v16, (a0), a2 + + ret +endfunc -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-10 8:21 ` uk7b @ 2024-05-12 11:48 ` Rémi Denis-Courmont 2024-05-12 12:43 ` uk7b 2024-05-12 12:43 ` flow gg 0 siblings, 2 replies; 16+ messages in thread From: Rémi Denis-Courmont @ 2024-05-12 11:48 UTC (permalink / raw) To: ffmpeg-devel Le perjantaina 10. toukokuuta 2024, 11.21.14 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > > C908 X60 > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c : 14.7 13.2 > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32 : 2.5 2.2 > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c : 3.7 3.5 > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64 : 1.0 1.2 > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c : 9.0 8.0 > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvi : 1.0 1.0 > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c : 2.5 2.2 > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi : 0.5 0.5 > --- > libavcodec/riscv/Makefile | 1 + > libavcodec/riscv/vc1dsp_init.c | 16 +++++++++++- > libavcodec/riscv/vc1dsp_rvi.S | 48 ++++++++++++++++++++++++++++++++++ > libavcodec/riscv/vc1dsp_rvv.S | 48 ++++++++++++++++++++++++++++++++++ > 4 files changed, 112 insertions(+), 1 deletion(-) > create mode 100644 libavcodec/riscv/vc1dsp_rvi.S > > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile > index 43b5c21cf4..cd5cc21cfd 100644 > --- a/libavcodec/riscv/Makefile > +++ b/libavcodec/riscv/Makefile > @@ -59,6 +59,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o > OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o > RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o > OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o > +RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o > RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o > OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o > RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o > diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c > index e47b644f80..555aa5aea7 100644 > --- a/libavcodec/riscv/vc1dsp_init.c > +++ 
b/libavcodec/riscv/vc1dsp_init.c > @@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, > ptrdiff_t stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t > *dest, ptrdiff_t stride, int16_t *block); void > ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t > *block); void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, > int16_t *block); +void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t > *src, ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvi(uint8_t > *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void > ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t > *src, ptrdiff_t line_size, int rnd); > > av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > { > -#if HAVE_RVV > +#if HAVE_RV > int flags = av_get_cpu_flags(); > > +# if __riscv_xlen >= 64 > + if (flags & AV_CPU_FLAG_RVI) { > + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi; > + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi; > + } > +# endif > +#if HAVE_RVV > if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { > dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; > dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; > + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; > if (flags & AV_CPU_FLAG_RVV_I64) { > dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; > dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; > + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; > } > } > #endif > +#endif > } > diff --git a/libavcodec/riscv/vc1dsp_rvi.S b/libavcodec/riscv/vc1dsp_rvi.S > new file mode 100644 > index 0000000000..1d5660316f > --- /dev/null > +++ b/libavcodec/riscv/vc1dsp_rvi.S > @@ -0,0 +1,48 @@ > +/* > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences > (ISCAS). + * > + * This file is part of FFmpeg. 
> + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA + */ > + > +#include "libavutil/riscv/asm.S" > + > +#if __riscv_xlen >= 64 > +func ff_put_pixels8x8_rvi > +.rept 8 > + ld t0, (a1) > + sd t0, (a0) > + add a1, a1, a2 > + add a0, a0, a2 > +.endr > + > + ret > +endfunc Are you sure that these accesses are aligned? Same below > + > +func ff_put_pixels16x16_rvi > +.rept 16 > + ld t0, (a1) > + ld t1, 8(a1) > + sd t0, (a0) > + sd t1, 8(a0) > + add a1, a1, a2 > + add a0, a0, a2 > +.endr > + > + ret > +endfunc > +#endif > + -- 雷米‧德尼-库尔蒙 http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-12 11:48 ` Rémi Denis-Courmont @ 2024-05-12 12:43 ` uk7b 2024-05-12 12:43 ` flow gg 1 sibling, 0 replies; 16+ messages in thread From: uk7b @ 2024-05-12 12:43 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908 X60 vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c : 14.7 13.2 vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32 : 2.5 2.2 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c : 3.7 3.5 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64 : 1.0 1.2 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c : 9.0 8.0 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvi : 1.0 1.0 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c : 2.5 2.2 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi : 0.5 0.5 --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/vc1dsp_init.c | 16 +++++++++++- libavcodec/riscv/vc1dsp_rvi.S | 47 +++++++++++++++++++++++++++++++++ libavcodec/riscv/vc1dsp_rvv.S | 48 ++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 libavcodec/riscv/vc1dsp_rvi.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index f2348e259e..6f4869145a 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -58,6 +58,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o +RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o RV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvi.o diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c index e47b644f80..2b7071d6ff 100644 --- a/libavcodec/riscv/vc1dsp_init.c +++ b/libavcodec/riscv/vc1dsp_init.c @@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, 
ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) { -#if HAVE_RVV +#if HAVE_RV int flags = av_get_cpu_flags(); +# if __riscv_xlen >= 64 + if (flags & AV_CPU_FLAG_RVI & AV_CPU_FLAG_RV_MISALIGNED) { + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi; + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi; + } +# endif +#if HAVE_RVV if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; if (flags & AV_CPU_FLAG_RVV_I64) { dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; } } #endif +#endif } diff --git a/libavcodec/riscv/vc1dsp_rvi.S b/libavcodec/riscv/vc1dsp_rvi.S new file mode 100644 index 0000000000..d4a1b5bf49 --- /dev/null +++ b/libavcodec/riscv/vc1dsp_rvi.S @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +#if __riscv_xlen >= 64 +func ff_put_pixels8x8_rvi +.rept 8 + ld t0, (a1) + sd t0, (a0) + add a1, a1, a2 + add a0, a0, a2 +.endr + + ret +endfunc + +func ff_put_pixels16x16_rvi +.rept 16 + ld t0, (a1) + ld t1, 8(a1) + sd t0, (a0) + sd t1, 8(a0) + add a1, a1, a2 + add a0, a0, a2 +.endr + + ret +endfunc +#endif diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S index 4a00945ead..7c2b47f66c 100644 --- a/libavcodec/riscv/vc1dsp_rvv.S +++ b/libavcodec/riscv/vc1dsp_rvv.S @@ -111,3 +111,51 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x vsse32.v v0, (a0), a1 ret endfunc + +.macro mspel_op op pos n1 n2 + add t1, \pos, a2 + v\op\()e8.v v\n1, (\pos) + sh1add \pos, a2, \pos + v\op\()e8.v v\n2, (t1) +.endm + +.macro mspel_op_all op pos a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16 + mspel_op \op \pos \a1 \a2 + mspel_op \op \pos \a3 \a4 + mspel_op \op \pos \a5 \a6 + mspel_op \op \pos \a7 \a8 + mspel_op \op \pos \a9 \a10 + mspel_op \op \pos \a11 \a12 + mspel_op \op \pos \a13 \a14 + mspel_op \op \pos \a15 \a16 +.endm + +func ff_avg_pixels16x16_rvv, zve32x + csrwi vxrm, 0 + vsetivli zero, 16, e8, m1, ta, ma + mspel_op_all l a1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + mspel_op_all l a0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + vsetvli t0, zero, e8, m8, ta, ma + sub a0, a0, a2 + vaaddu.vv v0, v0, v16 + neg a2, a2 + vaaddu.vv v8, v8, v24 + vsetivli zero, 16, e8, m1, ta, ma + mspel_op_all s a0 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + + ret 
+endfunc + +func ff_avg_pixels8x8_rvv, zve64x + csrwi vxrm, 0 + li t0, 64 + vsetivli zero, 8, e8, mf2, ta, ma + vlse64.v v16, (a1), a2 + vlse64.v v8, (a0), a2 + vsetvli zero, t0, e8, m4, ta, ma + vaaddu.vv v16, v16, v8 + vsetivli zero, 8, e8, mf2, ta, ma + vsse64.v v16, (a0), a2 + + ret +endfunc -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-12 11:48 ` Rémi Denis-Courmont 2024-05-12 12:43 ` uk7b @ 2024-05-12 12:43 ` flow gg 2024-05-12 12:57 ` uk7b 1 sibling, 1 reply; 16+ messages in thread From: flow gg @ 2024-05-12 12:43 UTC (permalink / raw) To: FFmpeg development discussions and patches It seems like it can't... update using AV_CPU_FLAG_RV_MISALIGNED Rémi Denis-Courmont <remi@remlab.net> 于2024年5月12日周日 19:48写道: > Le perjantaina 10. toukokuuta 2024, 11.21.14 EEST uk7b@foxmail.com a > écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > > C908 X60 > > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c : 14.7 13.2 > > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32 : 2.5 2.2 > > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c : 3.7 3.5 > > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64 : 1.0 1.2 > > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c : 9.0 8.0 > > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvi : 1.0 1.0 > > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c : 2.5 2.2 > > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi : 0.5 0.5 > > --- > > libavcodec/riscv/Makefile | 1 + > > libavcodec/riscv/vc1dsp_init.c | 16 +++++++++++- > > libavcodec/riscv/vc1dsp_rvi.S | 48 ++++++++++++++++++++++++++++++++++ > > libavcodec/riscv/vc1dsp_rvv.S | 48 ++++++++++++++++++++++++++++++++++ > > 4 files changed, 112 insertions(+), 1 deletion(-) > > create mode 100644 libavcodec/riscv/vc1dsp_rvi.S > > > > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile > > index 43b5c21cf4..cd5cc21cfd 100644 > > --- a/libavcodec/riscv/Makefile > > +++ b/libavcodec/riscv/Makefile > > @@ -59,6 +59,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o > > OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o > > RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o > > OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o > > +RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o > > RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o > > OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o > > 
RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o > > diff --git a/libavcodec/riscv/vc1dsp_init.c > b/libavcodec/riscv/vc1dsp_init.c > > index e47b644f80..555aa5aea7 100644 > > --- a/libavcodec/riscv/vc1dsp_init.c > > +++ b/libavcodec/riscv/vc1dsp_init.c > > @@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, > > ptrdiff_t stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t > > *dest, ptrdiff_t stride, int16_t *block); void > > ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t > > *block); void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t > stride, > > int16_t *block); +void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t > > *src, ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvi(uint8_t > > *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void > > ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > > line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const > uint8_t > > *src, ptrdiff_t line_size, int rnd); > > > > av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > > { > > -#if HAVE_RVV > > +#if HAVE_RV > > int flags = av_get_cpu_flags(); > > > > +# if __riscv_xlen >= 64 > > + if (flags & AV_CPU_FLAG_RVI) { > > + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi; > > + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi; > > + } > > +# endif > > +#if HAVE_RVV > > if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { > > dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; > > dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; > > + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; > > if (flags & AV_CPU_FLAG_RVV_I64) { > > dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; > > dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; > > + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; > > } > > } > > #endif > > +#endif > > } > > diff --git a/libavcodec/riscv/vc1dsp_rvi.S > 
b/libavcodec/riscv/vc1dsp_rvi.S > > new file mode 100644 > > index 0000000000..1d5660316f > > --- /dev/null > > +++ b/libavcodec/riscv/vc1dsp_rvi.S > > @@ -0,0 +1,48 @@ > > +/* > > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences > > (ISCAS). + * > > + * This file is part of FFmpeg. > > + * > > + * FFmpeg is free software; you can redistribute it and/or > > + * modify it under the terms of the GNU Lesser General Public > > + * License as published by the Free Software Foundation; either > > + * version 2.1 of the License, or (at your option) any later version. > > + * > > + * FFmpeg is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + * Lesser General Public License for more details. > > + * > > + * You should have received a copy of the GNU Lesser General Public > > + * License along with FFmpeg; if not, write to the Free Software > > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 > > USA + */ > > + > > +#include "libavutil/riscv/asm.S" > > + > > +#if __riscv_xlen >= 64 > > +func ff_put_pixels8x8_rvi > > +.rept 8 > > + ld t0, (a1) > > + sd t0, (a0) > > + add a1, a1, a2 > > + add a0, a0, a2 > > +.endr > > + > > + ret > > +endfunc > > Are you sure that these accesses are aligned? Same below > > > + > > +func ff_put_pixels16x16_rvi > > +.rept 16 > > + ld t0, (a1) > > + ld t1, 8(a1) > > + sd t0, (a0) > > + sd t1, 8(a0) > > + add a1, a1, a2 > > + add a0, a0, a2 > > +.endr > > + > > + ret > > +endfunc > > +#endif > > + > > -- > 雷米‧德尼-库尔蒙 > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". 
> _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-12 12:43 ` flow gg @ 2024-05-12 12:57 ` uk7b 0 siblings, 0 replies; 16+ messages in thread From: uk7b @ 2024-05-12 12:57 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908 X60 vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c : 14.7 13.2 vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32 : 2.5 2.2 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c : 3.7 3.5 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64 : 1.0 1.2 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c : 9.0 8.0 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvi : 1.0 1.0 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c : 2.5 2.2 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi : 0.5 0.5 --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/vc1dsp_init.c | 16 +++++++++++- libavcodec/riscv/vc1dsp_rvi.S | 47 +++++++++++++++++++++++++++++++++ libavcodec/riscv/vc1dsp_rvv.S | 48 ++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 libavcodec/riscv/vc1dsp_rvi.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index f2348e259e..6f4869145a 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -58,6 +58,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o +RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o RV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvi.o diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c index e47b644f80..2628369155 100644 --- a/libavcodec/riscv/vc1dsp_init.c +++ b/libavcodec/riscv/vc1dsp_init.c @@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); void 
ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) { -#if HAVE_RVV +#if HAVE_RV int flags = av_get_cpu_flags(); +# if __riscv_xlen >= 64 + if (flags & AV_CPU_FLAG_RV_MISALIGNED) { + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi; + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi; + } +# endif +#if HAVE_RVV if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; if (flags & AV_CPU_FLAG_RVV_I64) { dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; } } #endif +#endif } diff --git a/libavcodec/riscv/vc1dsp_rvi.S b/libavcodec/riscv/vc1dsp_rvi.S new file mode 100644 index 0000000000..d4a1b5bf49 --- /dev/null +++ b/libavcodec/riscv/vc1dsp_rvi.S @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +#if __riscv_xlen >= 64 +func ff_put_pixels8x8_rvi +.rept 8 + ld t0, (a1) + sd t0, (a0) + add a1, a1, a2 + add a0, a0, a2 +.endr + + ret +endfunc + +func ff_put_pixels16x16_rvi +.rept 16 + ld t0, (a1) + ld t1, 8(a1) + sd t0, (a0) + sd t1, 8(a0) + add a1, a1, a2 + add a0, a0, a2 +.endr + + ret +endfunc +#endif diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S index 4a00945ead..7c2b47f66c 100644 --- a/libavcodec/riscv/vc1dsp_rvv.S +++ b/libavcodec/riscv/vc1dsp_rvv.S @@ -111,3 +111,51 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x vsse32.v v0, (a0), a1 ret endfunc + +.macro mspel_op op pos n1 n2 + add t1, \pos, a2 + v\op\()e8.v v\n1, (\pos) + sh1add \pos, a2, \pos + v\op\()e8.v v\n2, (t1) +.endm + +.macro mspel_op_all op pos a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16 + mspel_op \op \pos \a1 \a2 + mspel_op \op \pos \a3 \a4 + mspel_op \op \pos \a5 \a6 + mspel_op \op \pos \a7 \a8 + mspel_op \op \pos \a9 \a10 + mspel_op \op \pos \a11 \a12 + mspel_op \op \pos \a13 \a14 + mspel_op \op \pos \a15 \a16 +.endm + +func ff_avg_pixels16x16_rvv, zve32x + csrwi vxrm, 0 + vsetivli zero, 16, e8, m1, ta, ma + mspel_op_all l a1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + mspel_op_all l a0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + vsetvli t0, zero, e8, m8, ta, ma + sub a0, a0, a2 + vaaddu.vv v0, v0, v16 + neg a2, a2 + vaaddu.vv v8, v8, v24 + vsetivli zero, 16, e8, m1, ta, ma + mspel_op_all s a0 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + + ret 
+endfunc + +func ff_avg_pixels8x8_rvv, zve64x + csrwi vxrm, 0 + li t0, 64 + vsetivli zero, 8, e8, mf2, ta, ma + vlse64.v v16, (a1), a2 + vlse64.v v8, (a0), a2 + vsetvli zero, t0, e8, m4, ta, ma + vaaddu.vv v16, v16, v8 + vsetivli zero, 8, e8, mf2, ta, ma + vsse64.v v16, (a0), a2 + + ret +endfunc -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-05 19:26 ` Rémi Denis-Courmont 2024-05-10 8:21 ` uk7b @ 2024-05-10 8:22 ` flow gg 2024-05-10 15:34 ` Rémi Denis-Courmont 1 sibling, 1 reply; 16+ messages in thread From: flow gg @ 2024-05-10 8:22 UTC (permalink / raw) To: FFmpeg development discussions and patches Hi, I got BananaPi F3, made some fixes, updated in reply Rémi Denis-Courmont <remi@remlab.net> 于2024年5月6日周一 03:26写道: > Le sunnuntaina 5. toukokuuta 2024, 12.18.56 EEST flow gg a écrit : > > > Does MF2 actually improve perfs over M1 here? > > > > The difference here seems very small, but when both mf2 and m1 are > correct, > > the test results have only shown mf2 to be better, so I want to use mf2. > > I can live with that. But this is a slippery slope because large vector > sizes > would involve even smaller fractions. Then we would need to compute the > value > which might negate the performance gains from fractional multipliers. > > The fastest approach that I can think of is a symbolic LA (which expands > to > 1xAUIPC + 1xLA) to load a precomputed VTYPE value from a static variable. > Furthermore, this requires VSETVL, which precludes immediate constant VL > Indeed, the VSETIVL instruction does not exist. > > AFAIU, BananaPi F3 has 256-bit vectors already now. > > -- > Rémi Denis-Courmont > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-10 8:22 ` flow gg @ 2024-05-10 15:34 ` Rémi Denis-Courmont 2024-05-11 10:02 ` flow gg 0 siblings, 1 reply; 16+ messages in thread From: Rémi Denis-Courmont @ 2024-05-10 15:34 UTC (permalink / raw) To: FFmpeg development discussions and patches Le perjantaina 10. toukokuuta 2024, 11.22.53 EEST flow gg a écrit : > Hi, I got BananaPi F3, made some fixes, updated in reply So... Does it benefit from halving the logical multiplier to process fixed-size blocks as compared to C908, or can we stick to the same code regardless of vector sizes? Also beware that X60 cores have in-order pipelines, so data dependencies will probably hurt more than on C908. -- Rémi Denis-Courmont http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-10 15:34 ` Rémi Denis-Courmont @ 2024-05-11 10:02 ` flow gg 2024-05-11 10:24 ` Rémi Denis-Courmont 0 siblings, 1 reply; 16+ messages in thread From: flow gg @ 2024-05-11 10:02 UTC (permalink / raw) To: FFmpeg development discussions and patches The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in vc1, or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8, results in a 10-20% performance decrease on both k230 and banana_f3. I think we should just continue using it as is... Rémi Denis-Courmont <remi@remlab.net> 于2024年5月10日周五 23:34写道: > Le perjantaina 10. toukokuuta 2024, 11.22.53 EEST flow gg a écrit : > > Hi, I got BananaPi F3, made some fixes, updated in reply > > So... Does it benefit from halving the logical multiplier to process > fixed-sized > block as compared to C908, or can we stick to the same code regardless of > vector sizes? > > Also beware that K60 cores have in-order pipelines, so data dependencies > will > probably hurt more than on C908. > > -- > Rémi Denis-Courmont > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-11 10:02 ` flow gg @ 2024-05-11 10:24 ` Rémi Denis-Courmont 2024-05-11 10:47 ` flow gg 0 siblings, 1 reply; 16+ messages in thread From: Rémi Denis-Courmont @ 2024-05-11 10:24 UTC (permalink / raw) To: FFmpeg development discussions and patches Le lauantaina 11. toukokuuta 2024, 13.02.02 EEST flow gg a écrit : > The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in > vc1, > or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8, > results in a 10-20% performance decrease on both k230 and banana_f3. The questions remain: how does changing from MF2 to MF4 affect performance on Zvl256b? And if it does, how do we deal with that without breaking support for Zvl128b? -- Rémi Denis-Courmont http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels 2024-05-11 10:24 ` Rémi Denis-Courmont @ 2024-05-11 10:47 ` flow gg 0 siblings, 0 replies; 16+ messages in thread From: flow gg @ 2024-05-11 10:47 UTC (permalink / raw) To: FFmpeg development discussions and patches On banana_f3, further reducing the fractional multiplier (e.g. from mf2 to mf4) resulted in another performance improvement. I think in the end we might need to use different functions depending on vlen in init. Rémi Denis-Courmont <remi@remlab.net> 于2024年5月11日周六 18:24写道: > Le lauantaina 11. toukokuuta 2024, 13.02.02 EEST flow gg a écrit : > > The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in > > vc1, > > or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8, > > results in a 10-20% performance decrease on both k230 and banana_f3. > > The questions remain, how changing from MF2 to MF4 affects performance on > Zvl256b, and if it does, how to deal with that without breaking support > for > Zvl128b. > > -- > Rémi Denis-Courmont > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2024-05-12 12:58 UTC | newest] Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2024-05-04 10:01 [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels uk7b 2024-05-04 10:08 ` flow gg 2024-05-04 17:53 ` Rémi Denis-Courmont 2024-05-05 9:15 ` uk7b 2024-05-05 9:18 ` flow gg 2024-05-05 19:26 ` Rémi Denis-Courmont 2024-05-10 8:21 ` uk7b 2024-05-12 11:48 ` Rémi Denis-Courmont 2024-05-12 12:43 ` uk7b 2024-05-12 12:43 ` flow gg 2024-05-12 12:57 ` uk7b 2024-05-10 8:22 ` flow gg 2024-05-10 15:34 ` Rémi Denis-Courmont 2024-05-11 10:02 ` flow gg 2024-05-11 10:24 ` Rémi Denis-Courmont 2024-05-11 10:47 ` flow gg
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git