* [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add
@ 2024-06-01 19:56 Rémi Denis-Courmont
2024-06-01 19:56 ` [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y Rémi Denis-Courmont
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont
0 siblings, 2 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-01 19:56 UTC (permalink / raw)
To: ffmpeg-devel
This just computes the DC coefficient and hands over to code shared
with VP8. Accordingly, the bulk of the changes is just rewriting the VP8
code so that it can be shared.
Nothing to write home about:
vp7_idct_dc_add_c: 1.7
vp7_idct_dc_add_rvv_i32: 1.2
---
libavcodec/riscv/vp7dsp_init.c | 12 +++++++++++-
libavcodec/riscv/vp8dsp_rvv.S | 30 +++++++++++++++++++++++-------
2 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c
index ae7f2d4277..491874483f 100644
--- a/libavcodec/riscv/vp7dsp_init.c
+++ b/libavcodec/riscv/vp7dsp_init.c
@@ -27,6 +27,15 @@
void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc);
+
+static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16],
+ ptrdiff_t stride)
+{
+ int dc = (23170 * (23170 * block[0] >> 14) + 0x20000) >> 18;
+
+ ff_vp78_idct_dc_add_rvv(dst, block, stride, dc);
+}
av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
{
@@ -37,8 +46,9 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
ff_rv_vlen_least(128)) {
#if __riscv_xlen >= 64
c->vp8_luma_dc_wht = ff_vp7_luma_dc_wht_rvv;
-#endif
c->vp8_idct_add = ff_vp7_idct_add_rvv;
+#endif
+ c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv;
}
#endif
}
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index b187c6c7c9..02351be383 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -100,6 +100,29 @@ func ff_vp8_luma_dc_wht_rvv, zve64x
endfunc
#endif
+func ff_vp8_idct_dc_add_rvv, zve32x
+ lh a3, (a1)
+ addi a3, a3, 4
+ srai a3, a3, 3
+ # fall through
+endfunc
+
+func ff_vp78_idct_dc_add_rvv, zve32x
+ csrwi vxrm, 0
+ vsetivli zero, 4, e8, mf4, ta, ma
+ sh zero, (a1)
+ vlse32.v v8, (a0), a2
+ vsetivli zero, 16, e16, m2, ta, ma
+ vzext.vf2 v16, v8
+ vadd.vx v16, v16, a3
+ vmax.vx v16, v16, zero
+ vsetvli zero, zero, e8, m1, ta, ma
+ vnclipu.wi v8, v16, 0
+ vsetivli zero, 4, e8, mf4, ta, ma
+ vsse32.v v8, (a0), a2
+ ret
+endfunc
+
.macro vp8_idct_dc_add
vlse32.v v0, (a0), a2
lh a5, 0(a1)
@@ -122,13 +145,6 @@ endfunc
addi a1, a1, 32
.endm
-func ff_vp8_idct_dc_add_rvv, zve32x
- vsetivli zero, 4, e8, mf4, ta, ma
- vp8_idct_dc_add
-
- ret
-endfunc
-
func ff_vp8_idct_dc_add4y_rvv, zve32x
vsetivli zero, 4, e8, mf4, ta, ma
.rept 3
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y
2024-06-01 19:56 [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add Rémi Denis-Courmont
@ 2024-06-01 19:56 ` Rémi Denis-Courmont
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont
1 sibling, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-01 19:56 UTC (permalink / raw)
To: ffmpeg-devel
As with idct_dc_add, most of the code is shared with, and replaces, the
previous VP8 function. To improve performance, we break down the 16x4
matrix into 4 rows, rather than 4 squares. Thus strided loads and
stores are avoided, and the 4 DC calculations are vectorised.
Unfortunately this requires a vector gather to splat the DC values, but
overall this is still a win for performance:
T-Head C908:
vp7_idct_dc_add4y_c: 7.2
vp7_idct_dc_add4y_rvv_i32: 2.2
vp8_idct_dc_add4y_c: 6.2
vp8_idct_dc_add4y_rvv_i32: 2.2 (before)
vp8_idct_dc_add4y_rvv_i32: 1.7
SpacemiT X60:
vp7_idct_dc_add4y_c: 6.2
vp7_idct_dc_add4y_rvv_i32: 2.0
vp8_idct_dc_add4y_c: 5.5
vp8_idct_dc_add4y_rvv_i32: 2.5 (before)
vp8_idct_dc_add4y_rvv_i32: 1.7
I also tried to provide the DC values using indexed loads. It ends up
slower overall, especially for VP7, as we then have to compute 16 DCs
instead of just 4.
---
libavcodec/riscv/vp7dsp_init.c | 2 ++
libavcodec/riscv/vp7dsp_rvv.S | 16 ++++++++++++
libavcodec/riscv/vp8dsp_rvv.S | 46 ++++++++++++++++++++++++++--------
3 files changed, 54 insertions(+), 10 deletions(-)
diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c
index 491874483f..fa5fb9d2ae 100644
--- a/libavcodec/riscv/vp7dsp_init.c
+++ b/libavcodec/riscv/vp7dsp_init.c
@@ -28,6 +28,7 @@
void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc);
+void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);
static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16],
ptrdiff_t stride)
@@ -49,6 +50,7 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
c->vp8_idct_add = ff_vp7_idct_add_rvv;
#endif
c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv;
+ c->vp8_idct_dc_add4y = ff_vp7_idct_dc_add4y_rvv;
}
#endif
}
diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S
index ecebbebd52..3fe859757d 100644
--- a/libavcodec/riscv/vp7dsp_rvv.S
+++ b/libavcodec/riscv/vp7dsp_rvv.S
@@ -129,3 +129,19 @@ func ff_vp7_idct_add_rvv, zve32x
ret
endfunc
#endif
+
+func ff_vp7_idct_dc_add4y_rvv, zve32x
+ li t0, 32
+ vsetivli zero, 4, e16, mf2, ta, ma
+ li t1, 23170
+ vlse16.v v8, (a1), t0 # block[0..3][0]
+ vwmul.vx v0, v8, t1
+ li t2, 0x20000
+ vsetvli zero, zero, e32, m1, ta, ma
+ vsra.vi v0, v0, 14
+ vmul.vx v0, v0, t1
+ vadd.vx v0, v0, t2
+ vsetvli zero, zero, e16, mf2, ta, ma
+ vnsra.wi v8, v0, 18 # 4x DC
+ tail ff_vp78_idct_dc_add4y_rvv
+endfunc
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 02351be383..9fa2ab2376 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -107,6 +107,7 @@ func ff_vp8_idct_dc_add_rvv, zve32x
# fall through
endfunc
+# a3 = DC
func ff_vp78_idct_dc_add_rvv, zve32x
csrwi vxrm, 0
vsetivli zero, 4, e8, mf4, ta, ma
@@ -123,6 +124,41 @@ func ff_vp78_idct_dc_add_rvv, zve32x
ret
endfunc
+func ff_vp8_idct_dc_add4y_rvv, zve32x
+ li t0, 32
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vlse16.v v8, (a1), t0
+ vadd.vi v8, v8, 4
+ vsra.vi v8, v8, 3
+ # fall through
+endfunc
+
+ .variant_cc ff_vp78_idct_dc_add4y_rvv
+# v8 = [dc0, dc1, dc2, dc3]
+func ff_vp78_idct_dc_add4y_rvv, zve32x
+ vsetivli zero, 16, e16, m2, ta, ma
+ vid.v v4
+ vsrl.vi v4, v4, 2
+ vrgather.vv v0, v8, v4 # replicate each DC four times
+ vsetvli zero, zero, e8, m1, ta, ma
+ li a4, 4
+1:
+ vle8.v v8, (a0)
+ addi a4, a4, -1
+ vwaddu.wv v16, v0, v8
+ sh zero, (a1)
+ vsetvli zero, zero, e16, m2, ta, ma
+ vmax.vx v16, v16, zero
+ addi a1, a1, 32
+ vsetvli zero, zero, e8, m1, ta, ma
+ vnclipu.wi v8, v16, 0
+ vse8.v v8, (a0)
+ add a0, a0, a2
+ bnez a4, 1b
+
+ ret
+endfunc
+
.macro vp8_idct_dc_add
vlse32.v v0, (a0), a2
lh a5, 0(a1)
@@ -145,16 +181,6 @@ endfunc
addi a1, a1, 32
.endm
-func ff_vp8_idct_dc_add4y_rvv, zve32x
- vsetivli zero, 4, e8, mf4, ta, ma
- .rept 3
- vp8_idct_dc_addy
- .endr
- vp8_idct_dc_add
-
- ret
-endfunc
-
func ff_vp8_idct_dc_add4uv_rvv, zve32x
vsetivli zero, 4, e8, mf4, ta, ma
vp8_idct_dc_addy
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y
2024-06-01 19:56 [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add Rémi Denis-Courmont
2024-06-01 19:56 ` [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y Rémi Denis-Courmont
@ 2024-06-02 10:24 ` Rémi Denis-Courmont
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv Rémi Denis-Courmont
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros Rémi Denis-Courmont
1 sibling, 2 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-02 10:24 UTC (permalink / raw)
To: ffmpeg-devel
DCT-related FFmpeg functions often add an unsigned 8-bit sample to a
signed 16-bit coefficient, then clip the result back to an unsigned
8-bit value. RISC-V has no signed 16-bit to unsigned 8-bit clip, so
instead our most common sequence is:
VWADDU.WV
set SEW to 16 bits
VMAX.VX zero # clip negative values to 0
set SEW to 8 bits
VNCLIPU.WI # clip values over 255 to 255 and narrow
Here we use a different sequence which does not require toggling the
vector type. This assumes that the wide addend vector is biased by
-128:
VWADDU.WV
VNCLIP.WI # clip values to signed 8-bit and narrow
VXOR.VX 0x80 # flip sign bit (convert signed to unsigned)
Also, the VMAX is effectively replaced by a VXOR at half the element
width. In this function, the -128 bias comes for free, as we add a
constant to the wide vector in the prologue anyway.
On C908, this has no observable effects. On X60, this improves
microbenchmarks by about 20%.
---
libavcodec/riscv/vp7dsp_rvv.S | 2 +-
libavcodec/riscv/vp8dsp_rvv.S | 14 +++++++-------
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S
index 39b23c2e79..09dcbf3857 100644
--- a/libavcodec/riscv/vp7dsp_rvv.S
+++ b/libavcodec/riscv/vp7dsp_rvv.S
@@ -134,7 +134,7 @@ func ff_vp7_idct_dc_add4y_rvv, zve32x
li t1, 23170
vlse16.v v8, (a1), t0 # block[0..3][0]
vwmul.vx v0, v8, t1
- li t2, 0x20000
+ li t2, 0x20000 - (128 << 18)
vsetvli zero, zero, e32, m1, ta, ma
vsra.vi v0, v0, 14
vmul.vx v0, v0, t1
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 8ea0a0c9bd..458eebb306 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -125,31 +125,31 @@ endfunc
func ff_vp8_idct_dc_add4y_rvv, zve32x
li t0, 32
vsetivli zero, 4, e16, mf2, ta, ma
+ li t1, 4 - (128 << 3)
vlse16.v v8, (a1), t0
- vadd.vi v8, v8, 4
+ vadd.vx v8, v8, t1
vsra.vi v8, v8, 3
# fall through
endfunc
.variant_cc ff_vp78_idct_dc_add4y_rvv
-# v8 = [dc0, dc1, dc2, dc3]
+# v8 = [dc0 - 128, dc1 - 128, dc2 - 128, dc3 - 128]
func ff_vp78_idct_dc_add4y_rvv, zve32x
vsetivli zero, 16, e16, m2, ta, ma
vid.v v4
+ li a4, 4
vsrl.vi v4, v4, 2
+ li t1, 128
vrgather.vv v0, v8, v4 # replicate each DC four times
vsetvli zero, zero, e8, m1, ta, ma
- li a4, 4
1:
vle8.v v8, (a0)
addi a4, a4, -1
vwaddu.wv v16, v0, v8
sh zero, (a1)
- vsetvli zero, zero, e16, m2, ta, ma
- vmax.vx v16, v16, zero
+ vnclip.wi v8, v16, 0
addi a1, a1, 32
- vsetvli zero, zero, e8, m1, ta, ma
- vnclipu.wi v8, v16, 0
+ vxor.vx v8, v8, t1
vse8.v v8, (a0)
add a0, a0, a2
bnez a4, 1b
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont
@ 2024-06-02 10:24 ` Rémi Denis-Courmont
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros Rémi Denis-Courmont
1 sibling, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-02 10:24 UTC (permalink / raw)
To: ffmpeg-devel
This is almost the same story as vp7_idct_dc_add4y. We just have to use
strided loads and stores of two 64-bit elements to account for the
different data layout in memory.
T-Head C908:
vp7_idct_dc_add4uv_c: 7.5
vp7_idct_dc_add4uv_rvv_i64: 2.0
vp8_idct_dc_add4uv_c: 6.2
vp8_idct_dc_add4uv_rvv_i32: 2.2 (before)
vp8_idct_dc_add4uv_rvv_i64: 2.0
SpacemiT X60:
vp7_idct_dc_add4uv_c: 6.7
vp7_idct_dc_add4uv_rvv_i64: 2.2
vp8_idct_dc_add4uv_c: 5.7
vp8_idct_dc_add4uv_rvv_i32: 2.5 (before)
vp8_idct_dc_add4uv_rvv_i64: 2.0
---
libavcodec/riscv/vp7dsp_init.c | 3 ++
libavcodec/riscv/vp7dsp_rvv.S | 6 ++--
libavcodec/riscv/vp8dsp_init.c | 3 +-
libavcodec/riscv/vp8dsp_rvv.S | 50 +++++++++++++++++++++++++---------
4 files changed, 45 insertions(+), 17 deletions(-)
diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c
index fa5fb9d2ae..9b8357ec05 100644
--- a/libavcodec/riscv/vp7dsp_init.c
+++ b/libavcodec/riscv/vp7dsp_init.c
@@ -29,6 +29,7 @@ void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc);
void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);
+void ff_vp7_idct_dc_add4uv_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);
static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16],
ptrdiff_t stride)
@@ -51,6 +52,8 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
#endif
c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv;
c->vp8_idct_dc_add4y = ff_vp7_idct_dc_add4y_rvv;
+ if (flags & AV_CPU_FLAG_RVV_I64)
+ c->vp8_idct_dc_add4uv = ff_vp7_idct_dc_add4uv_rvv;
}
#endif
}
diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S
index 09dcbf3857..856b0e8c96 100644
--- a/libavcodec/riscv/vp7dsp_rvv.S
+++ b/libavcodec/riscv/vp7dsp_rvv.S
@@ -128,7 +128,8 @@ func ff_vp7_idct_add_rvv, zve32x
endfunc
#endif
-func ff_vp7_idct_dc_add4y_rvv, zve32x
+.irp type, y, uv
+func ff_vp7_idct_dc_add4\type\()_rvv, zve32x
li t0, 32
vsetivli zero, 4, e16, mf2, ta, ma
li t1, 23170
@@ -141,5 +142,6 @@ func ff_vp7_idct_dc_add4y_rvv, zve32x
vadd.vx v0, v0, t2
vsetvli zero, zero, e16, mf2, ta, ma
vnsra.wi v8, v0, 18 # 4x DC
- tail ff_vp78_idct_dc_add4y_rvv
+ tail ff_vp78_idct_dc_add4\type\()_rvv
endfunc
+.endr
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 836237b41c..5911d195ba 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -131,9 +131,8 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
#endif
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_rvv;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_rvv;
- if (flags & AV_CPU_FLAG_RVB_ADDR) {
+ if (flags & AV_CPU_FLAG_RVV_I64)
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
- }
}
#endif
}
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 458eebb306..c83f9eec71 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -157,6 +157,43 @@ func ff_vp78_idct_dc_add4y_rvv, zve32x
ret
endfunc
+func ff_vp8_idct_dc_add4uv_rvv, zve32x
+ li t0, 32
+ vsetivli zero, 4, e16, mf2, ta, ma
+ li t1, 4 - (128 << 3)
+ vlse16.v v8, (a1), t0
+ vadd.vx v8, v8, t1
+ vsra.vi v8, v8, 3
+ # fall through
+endfunc
+
+ .variant_cc ff_vp78_idct_dc_add4uv_rvv
+func ff_vp78_idct_dc_add4uv_rvv, zve64x
+ vsetivli zero, 16, e16, m2, ta, ma
+ vid.v v4
+ li a4, 4
+ vsrl.vi v4, v4, 2
+ li t1, 128
+ vrgather.vv v0, v8, v4 # replicate each DC four times
+ slli t2, a2, 2
+ vsetivli zero, 2, e64, m1, ta, ma
+1:
+ vlse64.v v8, (a0), t2
+ addi a4, a4, -1
+ vsetivli zero, 16, e8, m1, ta, ma
+ vwaddu.wv v16, v0, v8
+ sh zero, (a1)
+ vnclip.wi v8, v16, 0
+ addi a1, a1, 32
+ vxor.vx v8, v8, t1
+ vsetivli zero, 2, e64, m1, ta, ma
+ vsse64.v v8, (a0), t2
+ add a0, a0, a2
+ bnez a4, 1b
+
+ ret
+endfunc
+
.macro vp8_idct_dc_add
vlse32.v v0, (a0), a2
lh a5, 0(a1)
@@ -179,19 +216,6 @@ endfunc
addi a1, a1, 32
.endm
-func ff_vp8_idct_dc_add4uv_rvv, zve32x
- vsetivli zero, 4, e8, mf4, ta, ma
- vp8_idct_dc_addy
- vp8_idct_dc_add
- addi a0, a0, -4
- sh2add a0, a2, a0
- addi a1, a1, 32
- vp8_idct_dc_addy
- vp8_idct_dc_add
-
- ret
-endfunc
-
.macro bilin_load dst type mn
.ifc \type,v
add t5, a2, a3
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv Rémi Denis-Courmont
@ 2024-06-02 10:24 ` Rémi Denis-Courmont
1 sibling, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-02 10:24 UTC (permalink / raw)
To: ffmpeg-devel
---
libavcodec/riscv/vp8dsp_rvv.S | 22 ----------------------
1 file changed, 22 deletions(-)
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index c83f9eec71..82489a7f14 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -194,28 +194,6 @@ func ff_vp78_idct_dc_add4uv_rvv, zve64x
ret
endfunc
-.macro vp8_idct_dc_add
- vlse32.v v0, (a0), a2
- lh a5, 0(a1)
- sh zero, 0(a1)
- addi a5, a5, 4
- srai t1, a5, 3
- vsetivli zero, 4*4, e16, m2, ta, ma
- vzext.vf2 v2, v0
- vadd.vx v2, v2, t1
- vmax.vx v2, v2, zero
- vsetvli zero, zero, e8, m1, ta, ma
- vnclipu.wi v0, v2, 0
- vsetivli zero, 4, e8, mf4, ta, ma
- vsse32.v v0, (a0), a2
-.endm
-
-.macro vp8_idct_dc_addy
- vp8_idct_dc_add
- addi a0, a0, 4
- addi a1, a1, 32
-.endm
-
.macro bilin_load dst type mn
.ifc \type,v
add t5, a2, a3
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-06-02 10:25 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-01 19:56 [FFmpeg-devel] [PATCH 1/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add Rémi Denis-Courmont
2024-06-01 19:56 ` [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: add R-V V vp7_idct_dc_add4y Rémi Denis-Courmont
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 3/5] lavc/vp8dsp: rework R-V V idct_dc_add4y Rémi Denis-Courmont
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 4/5] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv Rémi Denis-Courmont
2024-06-02 10:24 ` [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: remove no longer used macros Rémi Denis-Courmont
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git