Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add
@ 2024-06-01 19:56 Rémi Denis-Courmont
  2024-06-01 19:56 ` [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y Rémi Denis-Courmont
  2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont
  0 siblings, 2 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-01 19:56 UTC (permalink / raw)
  To: ffmpeg-devel

This just computes the DC coefficient and hands over to code shared
with VP8. Accordingly the bulk of changes are just rewriting the VP8
code to share.

Nothing to write home about:
vp7_idct_dc_add_c:       1.7
vp7_idct_dc_add_rvv_i32: 1.2
---
 libavcodec/riscv/vp7dsp_init.c | 12 +++++++++++-
 libavcodec/riscv/vp8dsp_rvv.S  | 30 +++++++++++++++++++++++-------
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c
index ae7f2d4277..491874483f 100644
--- a/libavcodec/riscv/vp7dsp_init.c
+++ b/libavcodec/riscv/vp7dsp_init.c
@@ -27,6 +27,15 @@
 
 void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
 void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc);
+
+static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16],
+                                   ptrdiff_t stride)
+{
+    int dc = (23170 * (23170 * block[0] >> 14) + 0x20000) >> 18;
+
+    ff_vp78_idct_dc_add_rvv(dst, block, stride, dc);
+}
 
 av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
 {
@@ -37,8 +46,9 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
         ff_rv_vlen_least(128)) {
 #if __riscv_xlen >= 64
         c->vp8_luma_dc_wht = ff_vp7_luma_dc_wht_rvv;
-#endif
         c->vp8_idct_add = ff_vp7_idct_add_rvv;
+#endif
+        c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv;
     }
 #endif
 }
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index b187c6c7c9..02351be383 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -100,6 +100,29 @@ func ff_vp8_luma_dc_wht_rvv, zve64x
 endfunc
 #endif
 
+func ff_vp8_idct_dc_add_rvv, zve32x
+        lh      a3, (a1)
+        addi    a3, a3, 4
+        srai    a3, a3, 3
+        # fall through
+endfunc
+
+func ff_vp78_idct_dc_add_rvv, zve32x
+        csrwi      vxrm, 0
+        vsetivli   zero, 4, e8, mf4, ta, ma
+        sh         zero, (a1)
+        vlse32.v   v8, (a0), a2
+        vsetivli   zero, 16, e16, m2, ta, ma
+        vzext.vf2  v16, v8
+        vadd.vx    v16, v16, a3
+        vmax.vx    v16, v16, zero
+        vsetvli    zero, zero, e8, m1, ta, ma
+        vnclipu.wi v8, v16, 0
+        vsetivli   zero, 4, e8, mf4, ta, ma
+        vsse32.v   v8, (a0), a2
+        ret
+endfunc
+
 .macro vp8_idct_dc_add
         vlse32.v      v0, (a0), a2
         lh            a5, 0(a1)
@@ -122,13 +145,6 @@ endfunc
         addi          a1, a1, 32
 .endm
 
-func ff_vp8_idct_dc_add_rvv, zve32x
-        vsetivli      zero, 4, e8, mf4, ta, ma
-        vp8_idct_dc_add
-
-        ret
-endfunc
-
 func ff_vp8_idct_dc_add4y_rvv, zve32x
         vsetivli      zero, 4, e8, mf4, ta, ma
         .rept 3
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y
  2024-06-01 19:56 [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add Rémi Denis-Courmont
@ 2024-06-01 19:56 ` Rémi Denis-Courmont
  2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont
  1 sibling, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-01 19:56 UTC (permalink / raw)
  To: ffmpeg-devel

As with idct_dc_add, most of the code is shared with, and replaces, the
previous VP8 function. To improve performance, we break down the 16x4
matrix into 4 rows, rather than 4 squares. Thus strided loads and
stores are avoided, and the 4 DC calculations are vectorised.
Unfortunately this requires a vector gather to splat the DC values, but
overall this is still a win for performance:

T-Head C908:
vp7_idct_dc_add4y_c:       7.2
vp7_idct_dc_add4y_rvv_i32: 2.2
vp8_idct_dc_add4y_c:       6.2
vp8_idct_dc_add4y_rvv_i32: 2.2 (before)
vp8_idct_dc_add4y_rvv_i32: 1.7

SpacemiT X60:
vp7_idct_dc_add4y_c:       6.2
vp7_idct_dc_add4y_rvv_i32: 2.0
vp8_idct_dc_add4y_c:       5.5
vp8_idct_dc_add4y_rvv_i32: 2.5 (before)
vp8_idct_dc_add4y_rvv_i32: 1.7

I also tried to provision the DC values using indexed loads. It ends up
slower overall, especially for VP7, as we then have to compute 16 DC's
instead of just 4.
---
 libavcodec/riscv/vp7dsp_init.c |  2 ++
 libavcodec/riscv/vp7dsp_rvv.S  | 16 ++++++++++++
 libavcodec/riscv/vp8dsp_rvv.S  | 46 ++++++++++++++++++++++++++--------
 3 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c
index 491874483f..fa5fb9d2ae 100644
--- a/libavcodec/riscv/vp7dsp_init.c
+++ b/libavcodec/riscv/vp7dsp_init.c
@@ -28,6 +28,7 @@
 void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
 void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
 void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc);
+void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);
 
 static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16],
                                    ptrdiff_t stride)
@@ -49,6 +50,7 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
         c->vp8_idct_add = ff_vp7_idct_add_rvv;
 #endif
         c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv;
+        c->vp8_idct_dc_add4y  = ff_vp7_idct_dc_add4y_rvv;
     }
 #endif
 }
diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S
index ecebbebd52..3fe859757d 100644
--- a/libavcodec/riscv/vp7dsp_rvv.S
+++ b/libavcodec/riscv/vp7dsp_rvv.S
@@ -129,3 +129,19 @@ func ff_vp7_idct_add_rvv, zve32x
         ret
 endfunc
 #endif
+
+func ff_vp7_idct_dc_add4y_rvv, zve32x
+        li       t0, 32
+        vsetivli zero, 4, e16, mf2, ta, ma
+        li       t1, 23170
+        vlse16.v v8, (a1), t0 # block[0..3][0]
+        vwmul.vx v0, v8, t1
+        li       t2, 0x20000
+        vsetvli  zero, zero, e32, m1, ta, ma
+        vsra.vi  v0, v0, 14
+        vmul.vx  v0, v0, t1
+        vadd.vx  v0, v0, t2
+        vsetvli  zero, zero, e16, mf2, ta, ma
+        vnsra.wi v8, v0, 18   # 4x DC
+        tail     ff_vp78_idct_dc_add4y_rvv
+endfunc
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 02351be383..9fa2ab2376 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -107,6 +107,7 @@ func ff_vp8_idct_dc_add_rvv, zve32x
         # fall through
 endfunc
 
+# a3 = DC
 func ff_vp78_idct_dc_add_rvv, zve32x
         csrwi      vxrm, 0
         vsetivli   zero, 4, e8, mf4, ta, ma
@@ -123,6 +124,41 @@ func ff_vp78_idct_dc_add_rvv, zve32x
         ret
 endfunc
 
+func ff_vp8_idct_dc_add4y_rvv, zve32x
+        li       t0, 32
+        vsetivli zero, 4, e16, mf2, ta, ma
+        vlse16.v v8, (a1), t0
+        vadd.vi  v8, v8, 4
+        vsra.vi  v8, v8, 3
+        # fall through
+endfunc
+
+        .variant_cc ff_vp78_idct_dc_add4y_rvv
+# v8 = [dc0, dc1, dc2, dc3]
+func ff_vp78_idct_dc_add4y_rvv, zve32x
+        vsetivli    zero, 16, e16, m2, ta, ma
+        vid.v       v4
+        vsrl.vi     v4, v4, 2
+        vrgather.vv v0, v8, v4 # replicate each DC four times
+        vsetvli     zero, zero, e8, m1, ta, ma
+        li          a4, 4
+1:
+        vle8.v      v8, (a0)
+        addi        a4, a4, -1
+        vwaddu.wv   v16, v0, v8
+        sh          zero, (a1)
+        vsetvli     zero, zero, e16, m2, ta, ma
+        vmax.vx     v16, v16, zero
+        addi        a1, a1, 32
+        vsetvli     zero, zero, e8, m1, ta, ma
+        vnclipu.wi  v8, v16, 0
+        vse8.v      v8, (a0)
+        add         a0, a0, a2
+        bnez        a4, 1b
+
+        ret
+endfunc
+
 .macro vp8_idct_dc_add
         vlse32.v      v0, (a0), a2
         lh            a5, 0(a1)
@@ -145,16 +181,6 @@ endfunc
         addi          a1, a1, 32
 .endm
 
-func ff_vp8_idct_dc_add4y_rvv, zve32x
-        vsetivli      zero, 4, e8, mf4, ta, ma
-        .rept 3
-        vp8_idct_dc_addy
-        .endr
-        vp8_idct_dc_add
-
-        ret
-endfunc
-
 func ff_vp8_idct_dc_add4uv_rvv, zve32x
         vsetivli      zero, 4, e8, mf4, ta, ma
         vp8_idct_dc_addy
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y
  2024-06-01 19:56 [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add Rémi Denis-Courmont
  2024-06-01 19:56 ` [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y Rémi Denis-Courmont
@ 2024-06-02 10:24 ` Rémi Denis-Courmont
  2024-06-02 10:24   ` [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv Rémi Denis-Courmont
  2024-06-02 10:24   ` [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros Rémi Denis-Courmont
  1 sibling, 2 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-02 10:24 UTC (permalink / raw)
  To: ffmpeg-devel

DCT-related FFmpeg functions often add an unsigned 8-bit sample to a
signed 16-bit coefficient, then clip the result back to an unsigned
8-bit value. RISC-V has no signed 16-bit to unsigned 8-bit clip, so
instead our most common sequence is:
    VWADDU.WV
    set SEW to 16 bits
    VMAX.VX zero     # clip negative values to 0
    set SEW to 8 bits
    VNCLIPU.WI       # clip values over 255 to 255 and narrow

Here we use a different sequence which does not require toggling the
vector type. This assumes that the wide addend vector is biased by
-128:
    VWADDU.WV
    VNCLIP.WI    # clip values to signed 8-bit and narrow
    VXOR.VX 0x80 # flip sign bit (convert signed to unsigned)

Also the VMAX is effectively replaced by a VXOR of half-width. In this
function, this comes for free as we anyway add a constant to the wide
vector in the prologue.

On C908, this has no observable effects. On X60, this improves
microbenchmarks by about 20%.
---
 libavcodec/riscv/vp7dsp_rvv.S |  2 +-
 libavcodec/riscv/vp8dsp_rvv.S | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S
index 39b23c2e79..09dcbf3857 100644
--- a/libavcodec/riscv/vp7dsp_rvv.S
+++ b/libavcodec/riscv/vp7dsp_rvv.S
@@ -134,7 +134,7 @@ func ff_vp7_idct_dc_add4y_rvv, zve32x
         li       t1, 23170
         vlse16.v v8, (a1), t0 # block[0..3][0]
         vwmul.vx v0, v8, t1
-        li       t2, 0x20000
+        li       t2, 0x20000 - (128 << 18)
         vsetvli  zero, zero, e32, m1, ta, ma
         vsra.vi  v0, v0, 14
         vmul.vx  v0, v0, t1
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 8ea0a0c9bd..458eebb306 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -125,31 +125,31 @@ endfunc
 func ff_vp8_idct_dc_add4y_rvv, zve32x
         li       t0, 32
         vsetivli zero, 4, e16, mf2, ta, ma
+        li       t1, 4 - (128 << 3)
         vlse16.v v8, (a1), t0
-        vadd.vi  v8, v8, 4
+        vadd.vx  v8, v8, t1
         vsra.vi  v8, v8, 3
         # fall through
 endfunc
 
         .variant_cc ff_vp78_idct_dc_add4y_rvv
-# v8 = [dc0, dc1, dc2, dc3]
+# v8 = [dc0 - 128, dc1 - 128, dc2 - 128, dc3 - 128]
 func ff_vp78_idct_dc_add4y_rvv, zve32x
         vsetivli    zero, 16, e16, m2, ta, ma
         vid.v       v4
+        li          a4, 4
         vsrl.vi     v4, v4, 2
+        li          t1, 128
         vrgather.vv v0, v8, v4 # replicate each DC four times
         vsetvli     zero, zero, e8, m1, ta, ma
-        li          a4, 4
 1:
         vle8.v      v8, (a0)
         addi        a4, a4, -1
         vwaddu.wv   v16, v0, v8
         sh          zero, (a1)
-        vsetvli     zero, zero, e16, m2, ta, ma
-        vmax.vx     v16, v16, zero
+        vnclip.wi   v8, v16, 0
         addi        a1, a1, 32
-        vsetvli     zero, zero, e8, m1, ta, ma
-        vnclipu.wi  v8, v16, 0
+        vxor.vx     v8, v8, t1
         vse8.v      v8, (a0)
         add         a0, a0, a2
         bnez        a4, 1b
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv
  2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont
@ 2024-06-02 10:24   ` Rémi Denis-Courmont
  2024-06-02 10:24   ` [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros Rémi Denis-Courmont
  1 sibling, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-02 10:24 UTC (permalink / raw)
  To: ffmpeg-devel

This is almost the same story as vp7_idct_dc_add4y. We just have to use
strided loads of 2 64-bit elements to account for the different data
layout in memory.

T-Head C908:
vp7_idct_dc_add4uv_c:       7.5
vp7_idct_dc_add4uv_rvv_i64: 2.0
vp8_idct_dc_add4uv_c:       6.2
vp8_idct_dc_add4uv_rvv_i32: 2.2 (before)
vp8_idct_dc_add4uv_rvv_i64: 2.0

SpacemiT X60:
vp7_idct_dc_add4uv_c:       6.7
vp7_idct_dc_add4uv_rvv_i64: 2.2
vp8_idct_dc_add4uv_c:       5.7
vp8_idct_dc_add4uv_rvv_i32: 2.5 (before)
vp8_idct_dc_add4uv_rvv_i64: 2.0
---
 libavcodec/riscv/vp7dsp_init.c |  3 ++
 libavcodec/riscv/vp7dsp_rvv.S  |  6 ++--
 libavcodec/riscv/vp8dsp_init.c |  3 +-
 libavcodec/riscv/vp8dsp_rvv.S  | 50 +++++++++++++++++++++++++---------
 4 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c
index fa5fb9d2ae..9b8357ec05 100644
--- a/libavcodec/riscv/vp7dsp_init.c
+++ b/libavcodec/riscv/vp7dsp_init.c
@@ -29,6 +29,7 @@ void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
 void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
 void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc);
 void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);
+void ff_vp7_idct_dc_add4uv_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);
 
 static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16],
                                    ptrdiff_t stride)
@@ -51,6 +52,8 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
 #endif
         c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv;
         c->vp8_idct_dc_add4y  = ff_vp7_idct_dc_add4y_rvv;
+        if (flags & AV_CPU_FLAG_RVV_I64)
+            c->vp8_idct_dc_add4uv = ff_vp7_idct_dc_add4uv_rvv;
     }
 #endif
 }
diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S
index 09dcbf3857..856b0e8c96 100644
--- a/libavcodec/riscv/vp7dsp_rvv.S
+++ b/libavcodec/riscv/vp7dsp_rvv.S
@@ -128,7 +128,8 @@ func ff_vp7_idct_add_rvv, zve32x
 endfunc
 #endif
 
-func ff_vp7_idct_dc_add4y_rvv, zve32x
+.irp type, y, uv
+func ff_vp7_idct_dc_add4\type\()_rvv, zve32x
         li       t0, 32
         vsetivli zero, 4, e16, mf2, ta, ma
         li       t1, 23170
@@ -141,5 +142,6 @@ func ff_vp7_idct_dc_add4y_rvv, zve32x
         vadd.vx  v0, v0, t2
         vsetvli  zero, zero, e16, mf2, ta, ma
         vnsra.wi v8, v0, 18   # 4x DC
-        tail     ff_vp78_idct_dc_add4y_rvv
+        tail     ff_vp78_idct_dc_add4\type\()_rvv
 endfunc
+.endr
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 836237b41c..5911d195ba 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -131,9 +131,8 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
 #endif
         c->vp8_idct_dc_add = ff_vp8_idct_dc_add_rvv;
         c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_rvv;
-        if (flags & AV_CPU_FLAG_RVB_ADDR) {
+        if (flags & AV_CPU_FLAG_RVV_I64)
             c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
-        }
     }
 #endif
 }
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 458eebb306..c83f9eec71 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -157,6 +157,43 @@ func ff_vp78_idct_dc_add4y_rvv, zve32x
         ret
 endfunc
 
+func ff_vp8_idct_dc_add4uv_rvv, zve32x
+        li       t0, 32
+        vsetivli zero, 4, e16, mf2, ta, ma
+        li       t1, 4 - (128 << 3)
+        vlse16.v v8, (a1), t0
+        vadd.vx  v8, v8, t1
+        vsra.vi  v8, v8, 3
+        # fall through
+endfunc
+
+        .variant_cc ff_vp78_idct_dc_add4uv_rvv
+func ff_vp78_idct_dc_add4uv_rvv, zve64x
+        vsetivli    zero, 16, e16, m2, ta, ma
+        vid.v       v4
+        li          a4, 4
+        vsrl.vi     v4, v4, 2
+        li          t1, 128
+        vrgather.vv v0, v8, v4 # replicate each DC four times
+        slli        t2, a2, 2
+        vsetivli    zero, 2, e64, m1, ta, ma
+1:
+        vlse64.v    v8, (a0), t2
+        addi        a4, a4, -1
+        vsetivli    zero, 16, e8, m1, ta, ma
+        vwaddu.wv   v16, v0, v8
+        sh          zero, (a1)
+        vnclip.wi   v8, v16, 0
+        addi        a1, a1, 32
+        vxor.vx     v8, v8, t1
+        vsetivli    zero, 2, e64, m1, ta, ma
+        vsse64.v    v8, (a0), t2
+        add         a0, a0, a2
+        bnez        a4, 1b
+
+        ret
+endfunc
+
 .macro vp8_idct_dc_add
         vlse32.v      v0, (a0), a2
         lh            a5, 0(a1)
@@ -179,19 +216,6 @@ endfunc
         addi          a1, a1, 32
 .endm
 
-func ff_vp8_idct_dc_add4uv_rvv, zve32x
-        vsetivli      zero, 4, e8, mf4, ta, ma
-        vp8_idct_dc_addy
-        vp8_idct_dc_add
-        addi          a0, a0, -4
-        sh2add        a0, a2, a0
-        addi          a1, a1, 32
-        vp8_idct_dc_addy
-        vp8_idct_dc_add
-
-        ret
-endfunc
-
 .macro bilin_load dst type mn
 .ifc \type,v
         add             t5, a2, a3
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros
  2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont
  2024-06-02 10:24   ` [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv Rémi Denis-Courmont
@ 2024-06-02 10:24   ` Rémi Denis-Courmont
  1 sibling, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-02 10:24 UTC (permalink / raw)
  To: ffmpeg-devel

---
 libavcodec/riscv/vp8dsp_rvv.S | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index c83f9eec71..82489a7f14 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -194,28 +194,6 @@ func ff_vp78_idct_dc_add4uv_rvv, zve64x
         ret
 endfunc
 
-.macro vp8_idct_dc_add
-        vlse32.v      v0, (a0), a2
-        lh            a5, 0(a1)
-        sh            zero, 0(a1)
-        addi          a5, a5, 4
-        srai          t1, a5, 3
-        vsetivli      zero, 4*4, e16, m2, ta, ma
-        vzext.vf2     v2, v0
-        vadd.vx       v2, v2, t1
-        vmax.vx       v2, v2, zero
-        vsetvli       zero, zero, e8, m1, ta, ma
-        vnclipu.wi    v0, v2, 0
-        vsetivli      zero, 4, e8, mf4, ta, ma
-        vsse32.v      v0, (a0), a2
-.endm
-
-.macro vp8_idct_dc_addy
-        vp8_idct_dc_add
-        addi          a0, a0, 4
-        addi          a1, a1, 32
-.endm
-
 .macro bilin_load dst type mn
 .ifc \type,v
         add             t5, a2, a3
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-06-02 10:25 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-01 19:56 [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add Rémi Denis-Courmont
2024-06-01 19:56 ` [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y Rémi Denis-Courmont
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont
2024-06-02 10:24   ` [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv Rémi Denis-Courmont
2024-06-02 10:24   ` [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros Rémi Denis-Courmont

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git