* [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add @ 2024-06-01 19:56 Rémi Denis-Courmont 2024-06-01 19:56 ` [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y Rémi Denis-Courmont 2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont 0 siblings, 2 replies; 5+ messages in thread From: Rémi Denis-Courmont @ 2024-06-01 19:56 UTC (permalink / raw) To: ffmpeg-devel This just computes the direct coefficient and hands over to code shared with VP8. Accordingly the bulk of changes are just rewriting the VP8 code to share. Nothing to write home about: vp7_idct_dc_add_c: 1.7 vp7_idct_dc_add_rvv_i32: 1.2 --- libavcodec/riscv/vp7dsp_init.c | 12 +++++++++++- libavcodec/riscv/vp8dsp_rvv.S | 30 +++++++++++++++++++++++------- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c index ae7f2d4277..491874483f 100644 --- a/libavcodec/riscv/vp7dsp_init.c +++ b/libavcodec/riscv/vp7dsp_init.c @@ -27,6 +27,15 @@ void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]); void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc); + +static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16], + ptrdiff_t stride) +{ + int dc = (23170 * (23170 * block[0] >> 14) + 0x20000) >> 18; + + ff_vp78_idct_dc_add_rvv(dst, block, stride, dc); +} av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c) { @@ -37,8 +46,9 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c) ff_rv_vlen_least(128)) { #if __riscv_xlen >= 64 c->vp8_luma_dc_wht = ff_vp7_luma_dc_wht_rvv; -#endif c->vp8_idct_add = ff_vp7_idct_add_rvv; +#endif + c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv; } #endif } diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S index b187c6c7c9..02351be383 100644 --- a/libavcodec/riscv/vp8dsp_rvv.S +++ 
b/libavcodec/riscv/vp8dsp_rvv.S @@ -100,6 +100,29 @@ func ff_vp8_luma_dc_wht_rvv, zve64x endfunc #endif +func ff_vp8_idct_dc_add_rvv, zve32x + lh a3, (a1) + addi a3, a3, 4 + srai a3, a3, 3 + # fall through +endfunc + +func ff_vp78_idct_dc_add_rvv, zve32x + csrwi vxrm, 0 + vsetivli zero, 4, e8, mf4, ta, ma + sh zero, (a1) + vlse32.v v8, (a0), a2 + vsetivli zero, 16, e16, m2, ta, ma + vzext.vf2 v16, v8 + vadd.vx v16, v16, a3 + vmax.vx v16, v16, zero + vsetvli zero, zero, e8, m1, ta, ma + vnclipu.wi v8, v16, 0 + vsetivli zero, 4, e8, mf4, ta, ma + vsse32.v v8, (a0), a2 + ret +endfunc + .macro vp8_idct_dc_add vlse32.v v0, (a0), a2 lh a5, 0(a1) @@ -122,13 +145,6 @@ endfunc addi a1, a1, 32 .endm -func ff_vp8_idct_dc_add_rvv, zve32x - vsetivli zero, 4, e8, mf4, ta, ma - vp8_idct_dc_add - - ret -endfunc - func ff_vp8_idct_dc_add4y_rvv, zve32x vsetivli zero, 4, e8, mf4, ta, ma .rept 3 -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y 2024-06-01 19:56 [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add Rémi Denis-Courmont @ 2024-06-01 19:56 ` Rémi Denis-Courmont 2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont 1 sibling, 0 replies; 5+ messages in thread From: Rémi Denis-Courmont @ 2024-06-01 19:56 UTC (permalink / raw) To: ffmpeg-devel As with idct_dc_add, most of the code is shared with, and replaces, the previous VP8 function. To improve performance, we break down the 16x4 matrix into 4 rows, rather than 4 squares. Thus strided loads and stores are avoided, and the 4 DC calculations are vectored. Unfortunately this requires a vector gather to splat the DC values, but overall this is still a win for performance: T-Head C908: vp7_idct_dc_add4y_c: 7.2 vp7_idct_dc_add4y_rvv_i32: 2.2 vp8_idct_dc_add4y_c: 6.2 vp8_idct_dc_add4y_rvv_i32: 2.2 (before) vp8_idct_dc_add4y_rvv_i32: 1.7 SpacemiT X60: vp7_idct_dc_add4y_c: 6.2 vp7_idct_dc_add4y_rvv_i32: 2.0 vp8_idct_dc_add4y_c: 5.5 vp8_idct_dc_add4y_rvv_i32: 2.5 (before) vp8_idct_dc_add4y_rvv_i32: 1.7 I also tried to provision the DC values using indexed loads. It ends up slower overall, especially for VP7, as we then have to compute 16 DC's instead of just 4. 
--- libavcodec/riscv/vp7dsp_init.c | 2 ++ libavcodec/riscv/vp7dsp_rvv.S | 16 ++++++++++++ libavcodec/riscv/vp8dsp_rvv.S | 46 ++++++++++++++++++++++++++-------- 3 files changed, 54 insertions(+), 10 deletions(-) diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c index 491874483f..fa5fb9d2ae 100644 --- a/libavcodec/riscv/vp7dsp_init.c +++ b/libavcodec/riscv/vp7dsp_init.c @@ -28,6 +28,7 @@ void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]); void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride); void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc); +void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t); static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride) @@ -49,6 +50,7 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c) c->vp8_idct_add = ff_vp7_idct_add_rvv; #endif c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv; + c->vp8_idct_dc_add4y = ff_vp7_idct_dc_add4y_rvv; } #endif } diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S index ecebbebd52..3fe859757d 100644 --- a/libavcodec/riscv/vp7dsp_rvv.S +++ b/libavcodec/riscv/vp7dsp_rvv.S @@ -129,3 +129,19 @@ func ff_vp7_idct_add_rvv, zve32x ret endfunc #endif + +func ff_vp7_idct_dc_add4y_rvv, zve32x + li t0, 32 + vsetivli zero, 4, e16, mf2, ta, ma + li t1, 23170 + vlse16.v v8, (a1), t0 # block[0..3][0] + vwmul.vx v0, v8, t1 + li t2, 0x20000 + vsetvli zero, zero, e32, m1, ta, ma + vsra.vi v0, v0, 14 + vmul.vx v0, v0, t1 + vadd.vx v0, v0, t2 + vsetvli zero, zero, e16, mf2, ta, ma + vnsra.wi v8, v0, 18 # 4x DC + tail ff_vp78_idct_dc_add4y_rvv +endfunc diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S index 02351be383..9fa2ab2376 100644 --- a/libavcodec/riscv/vp8dsp_rvv.S +++ b/libavcodec/riscv/vp8dsp_rvv.S @@ -107,6 +107,7 @@ func ff_vp8_idct_dc_add_rvv, zve32x # fall through endfunc +# a3 = DC func ff_vp78_idct_dc_add_rvv, zve32x csrwi 
vxrm, 0 vsetivli zero, 4, e8, mf4, ta, ma @@ -123,6 +124,41 @@ func ff_vp78_idct_dc_add_rvv, zve32x ret endfunc +func ff_vp8_idct_dc_add4y_rvv, zve32x + li t0, 32 + vsetivli zero, 4, e16, mf2, ta, ma + vlse16.v v8, (a1), t0 + vadd.vi v8, v8, 4 + vsra.vi v8, v8, 3 + # fall through +endfunc + + .variant_cc ff_vp78_idct_dc_add4y_rvv +# v8 = [dc0, dc1, dc2, dc3] +func ff_vp78_idct_dc_add4y_rvv, zve32x + vsetivli zero, 16, e16, m2, ta, ma + vid.v v4 + vsrl.vi v4, v4, 2 + vrgather.vv v0, v8, v4 # replicate each DC four times + vsetvli zero, zero, e8, m1, ta, ma + li a4, 4 +1: + vle8.v v8, (a0) + addi a4, a4, -1 + vwaddu.wv v16, v0, v8 + sh zero, (a1) + vsetvli zero, zero, e16, m2, ta, ma + vmax.vx v16, v16, zero + addi a1, a1, 32 + vsetvli zero, zero, e8, m1, ta, ma + vnclipu.wi v8, v16, 0 + vse8.v v8, (a0) + add a0, a0, a2 + bnez a4, 1b + + ret +endfunc + .macro vp8_idct_dc_add vlse32.v v0, (a0), a2 lh a5, 0(a1) @@ -145,16 +181,6 @@ endfunc addi a1, a1, 32 .endm -func ff_vp8_idct_dc_add4y_rvv, zve32x - vsetivli zero, 4, e8, mf4, ta, ma - .rept 3 - vp8_idct_dc_addy - .endr - vp8_idct_dc_add - - ret -endfunc - func ff_vp8_idct_dc_add4uv_rvv, zve32x vsetivli zero, 4, e8, mf4, ta, ma vp8_idct_dc_addy -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y 2024-06-01 19:56 [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add Rémi Denis-Courmont 2024-06-01 19:56 ` [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y Rémi Denis-Courmont @ 2024-06-02 10:24 ` Rémi Denis-Courmont 2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv Rémi Denis-Courmont 2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros Rémi Denis-Courmont 1 sibling, 2 replies; 5+ messages in thread From: Rémi Denis-Courmont @ 2024-06-02 10:24 UTC (permalink / raw) To: ffmpeg-devel DCT-related FFmpeg functions often add an unsigned 8-bit sample to a signed 16-bit coefficient, then clip the result back to an unsigned 8-bit value. RISC-V has no signed 16-bit to unsigned 8-bit clip, so instead our most common sequence is: VWADDU.WV set SEW to 16 bits VMAX.VX zero # clip negative values to 0 set SEW to 8 bits VNCLIPU.WI # clip values over 255 to 255 and narrow Here we use a different sequence which does not require toggling the vector type. This assumes that the wide addend vector is biased by -128: VWADDU.WV VNCLIP.WI # clip values to signed 8-bit and narrow VXOR.VX 0x80 # flip sign bit (convert signed to unsigned) Also the VMAX is effectively replaced by a VXOR of half-width. In this function, this comes for free as we anyway add a constant to the wide vector in the prologue. On C908, this has no observable effects. On X60, this improves microbenchmarks by about 20%. 
--- libavcodec/riscv/vp7dsp_rvv.S | 2 +- libavcodec/riscv/vp8dsp_rvv.S | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S index 39b23c2e79..09dcbf3857 100644 --- a/libavcodec/riscv/vp7dsp_rvv.S +++ b/libavcodec/riscv/vp7dsp_rvv.S @@ -134,7 +134,7 @@ func ff_vp7_idct_dc_add4y_rvv, zve32x li t1, 23170 vlse16.v v8, (a1), t0 # block[0..3][0] vwmul.vx v0, v8, t1 - li t2, 0x20000 + li t2, 0x20000 - (128 << 18) vsetvli zero, zero, e32, m1, ta, ma vsra.vi v0, v0, 14 vmul.vx v0, v0, t1 diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S index 8ea0a0c9bd..458eebb306 100644 --- a/libavcodec/riscv/vp8dsp_rvv.S +++ b/libavcodec/riscv/vp8dsp_rvv.S @@ -125,31 +125,31 @@ endfunc func ff_vp8_idct_dc_add4y_rvv, zve32x li t0, 32 vsetivli zero, 4, e16, mf2, ta, ma + li t1, 4 - (128 << 3) vlse16.v v8, (a1), t0 - vadd.vi v8, v8, 4 + vadd.vx v8, v8, t1 vsra.vi v8, v8, 3 # fall through endfunc .variant_cc ff_vp78_idct_dc_add4y_rvv -# v8 = [dc0, dc1, dc2, dc3] +# v8 = [dc0 - 128, dc1 - 128, dc2 - 128, dc3 - 128] func ff_vp78_idct_dc_add4y_rvv, zve32x vsetivli zero, 16, e16, m2, ta, ma vid.v v4 + li a4, 4 vsrl.vi v4, v4, 2 + li t1, 128 vrgather.vv v0, v8, v4 # replicate each DC four times vsetvli zero, zero, e8, m1, ta, ma - li a4, 4 1: vle8.v v8, (a0) addi a4, a4, -1 vwaddu.wv v16, v0, v8 sh zero, (a1) - vsetvli zero, zero, e16, m2, ta, ma - vmax.vx v16, v16, zero + vnclip.wi v8, v16, 0 addi a1, a1, 32 - vsetvli zero, zero, e8, m1, ta, ma - vnclipu.wi v8, v16, 0 + vxor.vx v8, v8, t1 vse8.v v8, (a0) add a0, a0, a2 bnez a4, 1b -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv 2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont @ 2024-06-02 10:24 ` Rémi Denis-Courmont 2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros Rémi Denis-Courmont 1 sibling, 0 replies; 5+ messages in thread From: Rémi Denis-Courmont @ 2024-06-02 10:24 UTC (permalink / raw) To: ffmpeg-devel This is almost the same story as vp7_idct_dc_add4y. We just have to use strided loads of 2 64-bit elements to account for the different data layout in memory. T-Head C908: vp7_idct_dc_add4uv_c: 7.5 vp7_idct_dc_add4uv_rvv_i64: 2.0 vp8_idct_dc_add4uv_c: 6.2 vp8_idct_dc_add4uv_rvv_i32: 2.2 (before) vp8_idct_dc_add4uv_rvv_i64: 2.0 SpacemiT X60: vp7_idct_dc_add4uv_c: 6.7 vp7_idct_dc_add4uv_rvv_i64: 2.2 vp8_idct_dc_add4uv_c: 5.7 vp8_idct_dc_add4uv_rvv_i32: 2.5 (before) vp8_idct_dc_add4uv_rvv_i64: 2.0 --- libavcodec/riscv/vp7dsp_init.c | 3 ++ libavcodec/riscv/vp7dsp_rvv.S | 6 ++-- libavcodec/riscv/vp8dsp_init.c | 3 +- libavcodec/riscv/vp8dsp_rvv.S | 50 +++++++++++++++++++++++++--------- 4 files changed, 45 insertions(+), 17 deletions(-) diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c index fa5fb9d2ae..9b8357ec05 100644 --- a/libavcodec/riscv/vp7dsp_init.c +++ b/libavcodec/riscv/vp7dsp_init.c @@ -29,6 +29,7 @@ void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]); void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride); void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc); void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t); +void ff_vp7_idct_dc_add4uv_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t); static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride) @@ -51,6 +52,8 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c) #endif c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv; 
c->vp8_idct_dc_add4y = ff_vp7_idct_dc_add4y_rvv; + if (flags & AV_CPU_FLAG_RVV_I64) + c->vp8_idct_dc_add4uv = ff_vp7_idct_dc_add4uv_rvv; } #endif } diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S index 09dcbf3857..856b0e8c96 100644 --- a/libavcodec/riscv/vp7dsp_rvv.S +++ b/libavcodec/riscv/vp7dsp_rvv.S @@ -128,7 +128,8 @@ func ff_vp7_idct_add_rvv, zve32x endfunc #endif -func ff_vp7_idct_dc_add4y_rvv, zve32x +.irp type, y, uv +func ff_vp7_idct_dc_add4\type\()_rvv, zve32x li t0, 32 vsetivli zero, 4, e16, mf2, ta, ma li t1, 23170 @@ -141,5 +142,6 @@ func ff_vp7_idct_dc_add4y_rvv, zve32x vadd.vx v0, v0, t2 vsetvli zero, zero, e16, mf2, ta, ma vnsra.wi v8, v0, 18 # 4x DC - tail ff_vp78_idct_dc_add4y_rvv + tail ff_vp78_idct_dc_add4\type\()_rvv endfunc +.endr diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c index 836237b41c..5911d195ba 100644 --- a/libavcodec/riscv/vp8dsp_init.c +++ b/libavcodec/riscv/vp8dsp_init.c @@ -131,9 +131,8 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c) #endif c->vp8_idct_dc_add = ff_vp8_idct_dc_add_rvv; c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_rvv; - if (flags & AV_CPU_FLAG_RVB_ADDR) { + if (flags & AV_CPU_FLAG_RVV_I64) c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv; - } } #endif } diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S index 458eebb306..c83f9eec71 100644 --- a/libavcodec/riscv/vp8dsp_rvv.S +++ b/libavcodec/riscv/vp8dsp_rvv.S @@ -157,6 +157,43 @@ func ff_vp78_idct_dc_add4y_rvv, zve32x ret endfunc +func ff_vp8_idct_dc_add4uv_rvv, zve32x + li t0, 32 + vsetivli zero, 4, e16, mf2, ta, ma + li t1, 4 - (128 << 3) + vlse16.v v8, (a1), t0 + vadd.vx v8, v8, t1 + vsra.vi v8, v8, 3 + # fall through +endfunc + + .variant_cc ff_vp78_idct_dc_add4uv_rvv +func ff_vp78_idct_dc_add4uv_rvv, zve64x + vsetivli zero, 16, e16, m2, ta, ma + vid.v v4 + li a4, 4 + vsrl.vi v4, v4, 2 + li t1, 128 + vrgather.vv v0, v8, v4 # replicate each DC four times + slli t2, a2, 2 + 
vsetivli zero, 2, e64, m1, ta, ma +1: + vlse64.v v8, (a0), t2 + addi a4, a4, -1 + vsetivli zero, 16, e8, m1, ta, ma + vwaddu.wv v16, v0, v8 + sh zero, (a1) + vnclip.wi v8, v16, 0 + addi a1, a1, 32 + vxor.vx v8, v8, t1 + vsetivli zero, 2, e64, m1, ta, ma + vsse64.v v8, (a0), t2 + add a0, a0, a2 + bnez a4, 1b + + ret +endfunc + .macro vp8_idct_dc_add vlse32.v v0, (a0), a2 lh a5, 0(a1) @@ -179,19 +216,6 @@ endfunc addi a1, a1, 32 .endm -func ff_vp8_idct_dc_add4uv_rvv, zve32x - vsetivli zero, 4, e8, mf4, ta, ma - vp8_idct_dc_addy - vp8_idct_dc_add - addi a0, a0, -4 - sh2add a0, a2, a0 - addi a1, a1, 32 - vp8_idct_dc_addy - vp8_idct_dc_add - - ret -endfunc - .macro bilin_load dst type mn .ifc \type,v add t5, a2, a3 -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros 2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont 2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv Rémi Denis-Courmont @ 2024-06-02 10:24 ` Rémi Denis-Courmont 1 sibling, 0 replies; 5+ messages in thread From: Rémi Denis-Courmont @ 2024-06-02 10:24 UTC (permalink / raw) To: ffmpeg-devel --- libavcodec/riscv/vp8dsp_rvv.S | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S index c83f9eec71..82489a7f14 100644 --- a/libavcodec/riscv/vp8dsp_rvv.S +++ b/libavcodec/riscv/vp8dsp_rvv.S @@ -194,28 +194,6 @@ func ff_vp78_idct_dc_add4uv_rvv, zve64x ret endfunc -.macro vp8_idct_dc_add - vlse32.v v0, (a0), a2 - lh a5, 0(a1) - sh zero, 0(a1) - addi a5, a5, 4 - srai t1, a5, 3 - vsetivli zero, 4*4, e16, m2, ta, ma - vzext.vf2 v2, v0 - vadd.vx v2, v2, t1 - vmax.vx v2, v2, zero - vsetvli zero, zero, e8, m1, ta, ma - vnclipu.wi v0, v2, 0 - vsetivli zero, 4, e8, mf4, ta, ma - vsse32.v v0, (a0), a2 -.endm - -.macro vp8_idct_dc_addy - vp8_idct_dc_add - addi a0, a0, 4 - addi a1, a1, 32 -.endm - .macro bilin_load dst type mn .ifc \type,v add t5, a2, a3 -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-06-02 10:25 UTC | newest] Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2024-06-01 19:56 [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add Rémi Denis-Courmont 2024-06-01 19:56 ` [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y Rémi Denis-Courmont 2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont 2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv Rémi Denis-Courmont 2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros Rémi Denis-Courmont
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git