* [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v
[not found] <20240506033809.3790245-1-uk7b@foxmail.com>
@ 2024-05-06 3:38 ` uk7b
2024-05-06 3:44 ` flow gg
2024-05-07 15:59 ` Rémi Denis-Courmont
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 3/9] lavc/vp8dsp: R-V V put_bilin_hv uk7b
` (6 subsequent siblings)
7 siblings, 2 replies; 15+ messages in thread
From: uk7b @ 2024-05-06 3:38 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp8_put_bilin4_h_c: 367.0
vp8_put_bilin4_h_rvv_i32: 137.7
vp8_put_bilin4_v_c: 377.0
vp8_put_bilin4_v_rvv_i32: 137.7
vp8_put_bilin8_h_c: 1431.0
vp8_put_bilin8_h_rvv_i32: 297.5
vp8_put_bilin8_v_c: 1449.0
vp8_put_bilin8_v_rvv_i32: 297.5
vp8_put_bilin16_h_c: 2839.0
vp8_put_bilin16_h_rvv_i32: 344.7
vp8_put_bilin16_v_c: 2857.0
vp8_put_bilin16_v_rvv_i32: 344.7
---
libavcodec/riscv/vp8dsp_init.c | 21 +++++++++++++++
libavcodec/riscv/vp8dsp_rvv.S | 49 ++++++++++++++++++++++++++++++++++
2 files changed, 70 insertions(+)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index fa3feeacf7..afffa6de2f 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -34,6 +34,10 @@ VP8_EPEL(16, rvi);
VP8_EPEL(8, rvi);
VP8_EPEL(4, rvi);
+VP8_BILIN(16, rvv);
+VP8_BILIN(8, rvv);
+VP8_BILIN(4, rvv);
+
av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
{
#if HAVE_RV
@@ -48,6 +52,23 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
c->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
}
+#if HAVE_RVV
+ if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
+ c->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_rvv;
+ c->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_rvv;
+ c->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_rvv;
+ c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv;
+ c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv;
+ c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv;
+
+ c->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_rvv;
+ c->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_rvv;
+ c->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_rvv;
+ c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv;
+ c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv;
+ c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv;
+ }
+#endif
#endif
}
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 8a0773f964..9bf969d794 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -20,6 +20,18 @@
#include "libavutil/riscv/asm.S"
+.macro vsetvlstatic8 len
+.if \len <= 4
+ vsetivli zero, \len, e8, mf4, ta, ma
+.elseif \len <= 8
+ vsetivli zero, \len, e8, mf2, ta, ma
+.elseif \len <= 16
+ vsetivli zero, \len, e8, m1, ta, ma
+.elseif \len <= 31
+ vsetivli zero, \len, e8, m2, ta, ma
+.endif
+.endm
+
.macro vp8_idct_dc_add
vlse32.v v0, (a0), a2
lh a5, 0(a1)
@@ -71,3 +83,40 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
ret
endfunc
+
+.macro bilin_load dst len type mn
+.ifc \type,v
+ add t5, a2, a3
+.elseif \type == h
+ addi t5, a2, 1
+.endif
+ vle8.v \dst, (a2)
+ vle8.v v2, (t5)
+ vwmulu.vx v28, \dst, t1
+ vwmaccu.vx v28, \mn, v2
+ vwaddu.wx v24, v28, t4
+ vnsra.wi \dst, v24, 3
+.endm
+
+.macro put_vp8_bilin_h_v len type mn
+func ff_put_vp8_bilin\len\()_\type\()_rvv, zve32x
+ vsetvlstatic8 \len
+ li t1, 8
+ li t4, 4
+ sub t1, t1, \mn
+1:
+ addi a4, a4, -1
+ bilin_load v0, \len, \type, \mn
+ vse8.v v0, (a0)
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
+.irp len 16,8,4
+put_vp8_bilin_h_v \len h a5
+put_vp8_bilin_h_v \len v a6
+.endr
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v3 3/9] lavc/vp8dsp: R-V V put_bilin_hv
[not found] <20240506033809.3790245-1-uk7b@foxmail.com>
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v uk7b
@ 2024-05-06 3:38 ` uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 4/9] lavc/vp8dsp: R-V V put_epel h uk7b
` (5 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: uk7b @ 2024-05-06 3:38 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp8_put_bilin4_hv_c: 561.0
vp8_put_bilin4_hv_rvv_i32: 232.7
vp8_put_bilin8_hv_c: 2162.7
vp8_put_bilin8_hv_rvv_i32: 506.7
vp8_put_bilin16_hv_c: 4769.7
vp8_put_bilin16_hv_rvv_i32: 556.7
---
libavcodec/riscv/vp8dsp_init.c | 13 +++++++++++++
libavcodec/riscv/vp8dsp_rvv.S | 26 ++++++++++++++++++++++++++
2 files changed, 39 insertions(+)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index afffa6de2f..9627105fc8 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -67,6 +67,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv;
c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv;
c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv;
+
+ c->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_rvv;
+ c->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_rvv;
+ c->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_rvv;
+ c->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_rvv;
+ c->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_rvv;
+ c->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_rvv;
+ c->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_rvv;
+ c->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_rvv;
+ c->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_rvv;
+ c->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_rvv;
+ c->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_rvv;
+ c->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_rvv;
}
#endif
#endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 9bf969d794..d30e4cab07 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -116,7 +116,33 @@ func ff_put_vp8_bilin\len\()_\type\()_rvv, zve32x
endfunc
.endm
+.macro put_vp8_bilin_hv len
+func ff_put_vp8_bilin\len\()_hv_rvv, zve32x
+ vsetvlstatic8 \len
+ li t3, 8
+ sub t1, t3, a5
+ sub t2, t3, a6
+ li t4, 4
+ bilin_load v4, \len, h, a5
+ add a2, a2, a3
+1:
+ addi a4, a4, -1
+ vwmulu.vx v20, v4, t2
+ bilin_load v4, \len, h, a5
+ vwmaccu.vx v20, a6, v4
+ vwaddu.wx v24, v20, t4
+ vnsra.wi v0, v24, 3
+ vse8.v v0, (a0)
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
.irp len 16,8,4
put_vp8_bilin_h_v \len h a5
put_vp8_bilin_h_v \len v a6
+put_vp8_bilin_hv \len
.endr
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v3 4/9] lavc/vp8dsp: R-V V put_epel h
[not found] <20240506033809.3790245-1-uk7b@foxmail.com>
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 3/9] lavc/vp8dsp: R-V V put_bilin_hv uk7b
@ 2024-05-06 3:38 ` uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 5/9] lavc/vp8dsp: R-V V put_epel v uk7b
` (4 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: uk7b @ 2024-05-06 3:38 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp8_put_epel4_h4_c: 10.7
vp8_put_epel4_h4_rvv_i32: 5.0
vp8_put_epel4_h6_c: 15.0
vp8_put_epel4_h6_rvv_i32: 6.2
vp8_put_epel8_h4_c: 43.2
vp8_put_epel8_h4_rvv_i32: 11.2
vp8_put_epel8_h6_c: 57.5
vp8_put_epel8_h6_rvv_i32: 13.5
vp8_put_epel16_h4_c: 92.5
vp8_put_epel16_h4_rvv_i32: 13.7
vp8_put_epel16_h6_c: 139.0
vp8_put_epel16_h6_rvv_i32: 16.5
---
libavcodec/riscv/vp8dsp_init.c | 10 ++++
libavcodec/riscv/vp8dsp_rvv.S | 87 ++++++++++++++++++++++++++++++++++
2 files changed, 97 insertions(+)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 9627105fc8..a4b7d49932 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -33,6 +33,9 @@ void ff_vp8_idct_dc_add4uv_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t str
VP8_EPEL(16, rvi);
VP8_EPEL(8, rvi);
VP8_EPEL(4, rvi);
+VP8_EPEL(16, rvv);
+VP8_EPEL(8, rvv);
+VP8_EPEL(4, rvv);
VP8_BILIN(16, rvv);
VP8_BILIN(8, rvv);
@@ -80,6 +83,13 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
c->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_rvv;
c->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_rvv;
c->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_rvv;
+
+ c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_rvv;
+ c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_rvv;
+ c->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_rvv;
+ c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
+ c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
+ c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;
}
#endif
#endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index d30e4cab07..30955a7b95 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -32,6 +32,16 @@
.endif
.endm
+.macro vsetvlstatic16 len
+.if \len <= 4
+ vsetivli zero, \len, e16, mf2, ta, ma
+.elseif \len <= 8
+ vsetivli zero, \len, e16, m1, ta, ma
+.elseif \len <= 16
+ vsetivli zero, \len, e16, m2, ta, ma
+.endif
+.endm
+
.macro vp8_idct_dc_add
vlse32.v v0, (a0), a2
lh a5, 0(a1)
@@ -141,8 +151,85 @@ func ff_put_vp8_bilin\len\()_hv_rvv, zve32x
endfunc
.endm
+const subpel_filters
+ .byte 0, -6, 123, 12, -1, 0
+ .byte 2, -11, 108, 36, -8, 1
+ .byte 0, -9, 93, 50, -6, 0
+ .byte 3, -16, 77, 77, -16, 3
+ .byte 0, -6, 50, 93, -9, 0
+ .byte 1, -8, 36, 108, -11, 2
+ .byte 0, -1, 12, 123, -6, 0
+endconst
+
+.macro epel_filter size
+ lla t2, subpel_filters
+ addi t0, a5, -1
+ li t1, 6
+ mul t0, t0, t1
+ add t0, t0, t2
+ .irp n 1,2,3,4
+ lb t\n, \n(t0)
+ .endr
+.ifc \size,6
+ lb t5, 5(t0)
+ lb t0, (t0)
+.endif
+.endm
+
+.macro epel_load dst len size
+ addi t6, a2, -1
+ addi a7, a2, 1
+ vle8.v v24, (a2)
+ vle8.v v22, (t6)
+ vle8.v v26, (a7)
+ addi a7, a7, 1
+ vle8.v v28, (a7)
+ vwmulu.vx v16, v24, t2
+ vwmulu.vx v20, v26, t3
+.ifc \size,6
+ addi t6, t6, -1
+ addi a7, a7, 1
+ vle8.v v24, (t6)
+ vle8.v v26, (a7)
+ vwmaccu.vx v16, t0, v24
+ vwmaccu.vx v16, t5, v26
+.endif
+ li t6, 64
+ vwmaccsu.vx v16, t1, v22
+ vwmaccsu.vx v16, t4, v28
+ vwadd.wx v16, v16, t6
+ vsetvlstatic16 \len
+ vwadd.vv v24, v16, v20
+ vnsra.wi v24, v24, 7
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \len
+ vnclipu.wi \dst, v24, 0
+.endm
+
+.macro epel_load_inc dst len size
+ epel_load \dst \len \size
+ add a2, a2, a3
+.endm
+
+.macro epel len size type
+func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
+ epel_filter \size
+ vsetvlstatic8 \len
+1:
+ addi a4, a4, -1
+ epel_load_inc v30 \len \size
+ vse8.v v30, (a0)
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
.irp len 16,8,4
put_vp8_bilin_h_v \len h a5
put_vp8_bilin_h_v \len v a6
put_vp8_bilin_hv \len
+epel \len 6 h
+epel \len 4 h
.endr
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v3 5/9] lavc/vp8dsp: R-V V put_epel v
[not found] <20240506033809.3790245-1-uk7b@foxmail.com>
` (2 preceding siblings ...)
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 4/9] lavc/vp8dsp: R-V V put_epel h uk7b
@ 2024-05-06 3:38 ` uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv uk7b
` (3 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: uk7b @ 2024-05-06 3:38 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp8_put_epel4_v4_c: 11.0
vp8_put_epel4_v4_rvv_i32: 5.0
vp8_put_epel4_v6_c: 16.5
vp8_put_epel4_v6_rvv_i32: 6.2
vp8_put_epel8_v4_c: 43.7
vp8_put_epel8_v4_rvv_i32: 11.2
vp8_put_epel8_v6_c: 68.7
vp8_put_epel8_v6_rvv_i32: 13.2
vp8_put_epel16_v4_c: 92.5
vp8_put_epel16_v4_rvv_i32: 13.7
vp8_put_epel16_v6_c: 135.7
vp8_put_epel16_v6_rvv_i32: 16.5
---
libavcodec/riscv/vp8dsp_init.c | 7 +++++++
libavcodec/riscv/vp8dsp_rvv.S | 34 +++++++++++++++++++++++-----------
2 files changed, 30 insertions(+), 11 deletions(-)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index a4b7d49932..dc3e087f01 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -90,6 +90,13 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;
+
+ c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_rvv;
+ c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_rvv;
+ c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_rvv;
+ c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
+ c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
+ c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
}
#endif
#endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 30955a7b95..bf268e4d8d 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -161,9 +161,13 @@ const subpel_filters
.byte 0, -1, 12, 123, -6, 0
endconst
-.macro epel_filter size
+.macro epel_filter size type
lla t2, subpel_filters
+.ifc \type,v
+ addi t0, a6, -1
+.elseif \type == h
addi t0, a5, -1
+.endif
li t1, 6
mul t0, t0, t1
add t0, t0, t2
@@ -176,19 +180,25 @@ endconst
.endif
.endm
-.macro epel_load dst len size
- addi t6, a2, -1
- addi a7, a2, 1
+.macro epel_load dst len size type
+.ifc \type,v
+ mv a5, a3
+.else
+ li a5, 1
+.endif
+ sub t6, a2, a5
+ add a7, a2, a5
+
vle8.v v24, (a2)
vle8.v v22, (t6)
vle8.v v26, (a7)
- addi a7, a7, 1
+ add a7, a7, a5
vle8.v v28, (a7)
vwmulu.vx v16, v24, t2
vwmulu.vx v20, v26, t3
.ifc \size,6
- addi t6, t6, -1
- addi a7, a7, 1
+ sub t6, t6, a5
+ add a7, a7, a5
vle8.v v24, (t6)
vle8.v v26, (a7)
vwmaccu.vx v16, t0, v24
@@ -206,18 +216,18 @@ endconst
vnclipu.wi \dst, v24, 0
.endm
-.macro epel_load_inc dst len size
- epel_load \dst \len \size
+.macro epel_load_inc dst len size type
+ epel_load \dst \len \size \type
add a2, a2, a3
.endm
.macro epel len size type
func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
- epel_filter \size
+ epel_filter \size \type
vsetvlstatic8 \len
1:
addi a4, a4, -1
- epel_load_inc v30 \len \size
+ epel_load_inc v30 \len \size \type
vse8.v v30, (a0)
add a0, a0, a1
bnez a4, 1b
@@ -232,4 +242,6 @@ put_vp8_bilin_h_v \len v a6
put_vp8_bilin_hv \len
epel \len 6 h
epel \len 4 h
+epel \len 6 v
+epel \len 4 v
.endr
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv
[not found] <20240506033809.3790245-1-uk7b@foxmail.com>
` (3 preceding siblings ...)
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 5/9] lavc/vp8dsp: R-V V put_epel v uk7b
@ 2024-05-06 3:38 ` uk7b
2024-05-06 19:24 ` Rémi Denis-Courmont
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 7/9] lavc/vp8dsp: R-V V loop_filter_simple uk7b
` (2 subsequent siblings)
7 siblings, 1 reply; 15+ messages in thread
From: uk7b @ 2024-05-06 3:38 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp8_put_epel4_h4v4_c: 20.0
vp8_put_epel4_h4v4_rvv_i32: 11.0
vp8_put_epel4_h4v6_c: 25.2
vp8_put_epel4_h4v6_rvv_i32: 13.5
vp8_put_epel4_h6v4_c: 22.2
vp8_put_epel4_h6v4_rvv_i32: 14.5
vp8_put_epel4_h6v6_c: 29.0
vp8_put_epel4_h6v6_rvv_i32: 15.7
vp8_put_epel8_h4v4_c: 73.0
vp8_put_epel8_h4v4_rvv_i32: 22.2
vp8_put_epel8_h4v6_c: 90.5
vp8_put_epel8_h4v6_rvv_i32: 26.7
vp8_put_epel8_h6v4_c: 85.0
vp8_put_epel8_h6v4_rvv_i32: 27.2
vp8_put_epel8_h6v6_c: 104.7
vp8_put_epel8_h6v6_rvv_i32: 29.5
vp8_put_epel16_h4v4_c: 145.5
vp8_put_epel16_h4v4_rvv_i32: 26.5
vp8_put_epel16_h4v6_c: 190.7
vp8_put_epel16_h4v6_rvv_i32: 47.5
vp8_put_epel16_h6v4_c: 173.7
vp8_put_epel16_h6v4_rvv_i32: 33.2
vp8_put_epel16_h6v6_c: 222.2
vp8_put_epel16_h6v6_rvv_i32: 35.5
---
libavcodec/riscv/vp8dsp_init.c | 13 ++++
libavcodec/riscv/vp8dsp_rvv.S | 117 +++++++++++++++++++++++++++------
2 files changed, 109 insertions(+), 21 deletions(-)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index dc3e087f01..463c8fa0a2 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
+
+ c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
+ c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
+ c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
+ c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
+ c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
+ c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
+ c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
+ c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
+ c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
+ c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
+ c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
+ c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
}
#endif
#endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index bf268e4d8d..baa8152830 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -161,26 +161,26 @@ const subpel_filters
.byte 0, -1, 12, 123, -6, 0
endconst
-.macro epel_filter size type
- lla t2, subpel_filters
+.macro epel_filter size type regtype
+ lla \regtype\()2, subpel_filters
.ifc \type,v
- addi t0, a6, -1
+ addi \regtype\()0, a6, -1
.elseif \type == h
- addi t0, a5, -1
+ addi \regtype\()0, a5, -1
.endif
- li t1, 6
- mul t0, t0, t1
- add t0, t0, t2
+ li \regtype\()1, 6
+ mul \regtype\()0, \regtype\()0, \regtype\()1
+ add \regtype\()0, \regtype\()0, \regtype\()2
.irp n 1,2,3,4
- lb t\n, \n(t0)
+ lb \regtype\n, \n(\regtype\()0)
.endr
.ifc \size,6
- lb t5, 5(t0)
- lb t0, (t0)
+ lb \regtype\()5, 5(\regtype\()0)
+ lb \regtype\()0, (\regtype\()0)
.endif
.endm
-.macro epel_load dst len size type
+.macro epel_load dst len size type from_mem regtype
.ifc \type,v
mv a5, a3
.else
@@ -189,24 +189,35 @@ endconst
sub t6, a2, a5
add a7, a2, a5
+.if \from_mem
vle8.v v24, (a2)
vle8.v v22, (t6)
vle8.v v26, (a7)
add a7, a7, a5
vle8.v v28, (a7)
- vwmulu.vx v16, v24, t2
- vwmulu.vx v20, v26, t3
+ vwmulu.vx v16, v24, \regtype\()2
+ vwmulu.vx v20, v26, \regtype\()3
.ifc \size,6
sub t6, t6, a5
add a7, a7, a5
vle8.v v24, (t6)
vle8.v v26, (a7)
- vwmaccu.vx v16, t0, v24
- vwmaccu.vx v16, t5, v26
+ vwmaccu.vx v16, \regtype\()0, v24
+ vwmaccu.vx v16, \regtype\()5, v26
+.endif
+ vwmaccsu.vx v16, \regtype\()1, v22
+ vwmaccsu.vx v16, \regtype\()4, v28
+.else
+ vwmulu.vx v16, v4, \regtype\()2
+ vwmulu.vx v20, v6, \regtype\()3
+ .ifc \size,6
+ vwmaccu.vx v16, \regtype\()0, v0
+ vwmaccu.vx v16, \regtype\()5, v10
+ .endif
+ vwmaccsu.vx v16, \regtype\()1, v2
+ vwmaccsu.vx v16, \regtype\()4, v8
.endif
li t6, 64
- vwmaccsu.vx v16, t1, v22
- vwmaccsu.vx v16, t4, v28
vwadd.wx v16, v16, t6
vsetvlstatic16 \len
vwadd.vv v24, v16, v20
@@ -216,18 +227,18 @@ endconst
vnclipu.wi \dst, v24, 0
.endm
-.macro epel_load_inc dst len size type
- epel_load \dst \len \size \type
+.macro epel_load_inc dst len size type from_mem regtype
+ epel_load \dst \len \size \type \from_mem \regtype
add a2, a2, a3
.endm
.macro epel len size type
func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
- epel_filter \size \type
+ epel_filter \size \type t
vsetvlstatic8 \len
1:
addi a4, a4, -1
- epel_load_inc v30 \len \size \type
+ epel_load_inc v30 \len \size \type 1 t
vse8.v v30, (a0)
add a0, a0, a1
bnez a4, 1b
@@ -236,6 +247,66 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
endfunc
.endm
+.macro epel_hv len hsize vsize
+func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
+ addi sp, sp, -48
+ .irp n 0,1,2,3,4,5
+#if __riscv_xlen >= 64
+ sd s\n, \n\()<<3(sp)
+#else
+ sw s\n, \n\()<<3(sp)
+#endif
+ .endr
+ sub a2, a2, a3
+ epel_filter \hsize h t
+ epel_filter \vsize v s
+ vsetvlstatic8 \len
+.if \hsize == 6 || \vsize == 6
+ sub a2, a2, a3
+ epel_load_inc v0 \len \hsize h 1 t
+.endif
+ epel_load_inc v2 \len \hsize h 1 t
+ epel_load_inc v4 \len \hsize h 1 t
+ epel_load_inc v6 \len \hsize h 1 t
+ epel_load_inc v8 \len \hsize h 1 t
+.if \hsize == 6 || \vsize == 6
+ epel_load_inc v10 \len \hsize h 1 t
+.endif
+ addi a4, a4, -1
+1:
+ addi a4, a4, -1
+ epel_load v30 \len \vsize v 0 s
+ vse8.v v30, (a0)
+.if \hsize == 6 || \vsize == 6
+ vmv.v.v v0, v2
+.endif
+ vmv.v.v v2, v4
+ vmv.v.v v4, v6
+ vmv.v.v v6, v8
+.if \hsize == 6 || \vsize == 6
+ vmv.v.v v8, v10
+ epel_load_inc v10 \len \hsize h 1 t
+.else
+ epel_load_inc v8 \len 4 h 1 t
+.endif
+ add a0, a0, a1
+ bnez a4, 1b
+ epel_load v30 \len \vsize v 0 s
+ vse8.v v30, (a0)
+
+ .irp n 0,1,2,3,4,5
+#if __riscv_xlen >= 64
+ ld s\n, \n\()<<3(sp)
+#else
+ lw s\n, \n\()<<3(sp)
+#endif
+ .endr
+ addi sp, sp, 48
+
+ ret
+endfunc
+.endm
+
.irp len 16,8,4
put_vp8_bilin_h_v \len h a5
put_vp8_bilin_h_v \len v a6
@@ -244,4 +315,8 @@ epel \len 6 h
epel \len 4 h
epel \len 6 v
epel \len 4 v
+epel_hv \len 6 6
+epel_hv \len 4 4
+epel_hv \len 6 4
+epel_hv \len 4 6
.endr
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v3 7/9] lavc/vp8dsp: R-V V loop_filter_simple
[not found] <20240506033809.3790245-1-uk7b@foxmail.com>
` (4 preceding siblings ...)
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv uk7b
@ 2024-05-06 3:38 ` uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 8/9] lavc/vp8dsp: R-V V loop_filter_inner uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 9/9] lavc/vp8dsp: R-V V loop_filter uk7b
7 siblings, 0 replies; 15+ messages in thread
From: uk7b @ 2024-05-06 3:38 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp8_loop_filter_simple_h_c: 416.0
vp8_loop_filter_simple_h_rvv_i32: 187.5
vp8_loop_filter_simple_v_c: 429.7
vp8_loop_filter_simple_v_rvv_i32: 104.0
---
libavcodec/riscv/vp8dsp_init.c | 5 ++
libavcodec/riscv/vp8dsp_rvv.S | 85 ++++++++++++++++++++++++++++++++++
2 files changed, 90 insertions(+)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 463c8fa0a2..3acfe75d67 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -41,6 +41,8 @@ VP8_BILIN(16, rvv);
VP8_BILIN(8, rvv);
VP8_BILIN(4, rvv);
+VP8_LF(rvv);
+
av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
{
#if HAVE_RV
@@ -126,6 +128,9 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
if (flags & AV_CPU_FLAG_RVB_ADDR) {
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
}
+
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_rvv;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_rvv;
}
#endif
}
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index baa8152830..2ac79a3b77 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -94,6 +94,91 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
ret
endfunc
+.macro filter_fmin len a f1 p0f2 q0f1
+ vsetvlstatic16 \len
+ vsext.vf2 \q0f1, \a
+ vmin.vx \p0f2, \q0f1, a7
+ vmin.vx \q0f1, \q0f1, t3
+ vadd.vi \p0f2, \p0f2, 3
+ vadd.vi \q0f1, \q0f1, 4
+ vsra.vi \p0f2, \p0f2, 3
+ vsra.vi \f1, \q0f1, 3
+ vadd.vv \p0f2, \p0f2, v8
+ vsub.vv \q0f1, v16, \f1
+ vmax.vx \p0f2, \p0f2, zero
+ vmax.vx \q0f1, \q0f1, zero
+.endm
+
+.macro filter len type normal inner dst stride fE fI thresh
+.ifc \type,v
+ slli a6, \stride, 1
+ sub t2, \dst, a6
+ add t4, \dst, \stride
+ sub t1, \dst, \stride
+ vle8.v v1, (t2)
+ vle8.v v11, (t4)
+ vle8.v v17, (t1)
+ vle8.v v22, (\dst)
+.else
+ addi t1, \dst, -1
+ addi a6, \dst, -2
+ addi t4, \dst, 1
+ vlse8.v v1, (a6), \stride
+ vlse8.v v11, (t4), \stride
+ vlse8.v v17, (t1), \stride
+ vlse8.v v22, (\dst), \stride
+.endif
+ vwsubu.vv v12, v1, v11 // p1-q1
+ vwsubu.vv v24, v22, v17 // q0-p0
+ vnclip.wi v23, v12, 0
+ vsetvlstatic16 \len
+ // vp8_simple_limit(dst + i, stride, flim)
+ li a7, 2
+ vneg.v v18, v12
+ vmax.vv v18, v18, v12
+ vneg.v v8, v24
+ vmax.vv v8, v8, v24
+ vsrl.vi v18, v18, 1
+ vmacc.vx v18, a7, v8
+ vmsleu.vx v0, v18, \fE
+
+ li t5, 3
+ li a7, 124
+ li t3, 123
+ vsext.vf2 v4, v23
+ vzext.vf2 v8, v17 // p0
+ vzext.vf2 v16, v22 // q0
+ vmul.vx v30, v24, t5
+ vadd.vv v12, v30, v4
+ vsetvlstatic8 \len
+ vnclip.wi v11, v12, 0
+ filter_fmin \len v11 v24 v4 v6
+ vsetvlstatic8 \len
+ vnclipu.wi v4, v4, 0
+ vnclipu.wi v6, v6, 0
+
+.ifc \type,v
+ vse8.v v4, (t1), v0.t
+ vse8.v v6, (\dst), v0.t
+.else
+ vsse8.v v4, (t1), \stride, v0.t
+ vsse8.v v6, (\dst), \stride, v0.t
+.endif
+
+.endm
+
+func ff_vp8_v_loop_filter16_simple_rvv, zve32x
+ vsetvlstatic8 16
+ filter 16 v 0 0 a0 a1 a2 a3 a4
+ ret
+endfunc
+
+func ff_vp8_h_loop_filter16_simple_rvv, zve32x
+ vsetvlstatic8 16
+ filter 16 h 0 0 a0 a1 a2 a3 a4
+ ret
+endfunc
+
.macro bilin_load dst len type mn
.ifc \type,v
add t5, a2, a3
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v3 8/9] lavc/vp8dsp: R-V V loop_filter_inner
[not found] <20240506033809.3790245-1-uk7b@foxmail.com>
` (5 preceding siblings ...)
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 7/9] lavc/vp8dsp: R-V V loop_filter_simple uk7b
@ 2024-05-06 3:38 ` uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 9/9] lavc/vp8dsp: R-V V loop_filter uk7b
7 siblings, 0 replies; 15+ messages in thread
From: uk7b @ 2024-05-06 3:38 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp8_loop_filter8uv_inner_v_c: 738.2
vp8_loop_filter8uv_inner_v_rvv_i32: 455.2
vp8_loop_filter16y_inner_h_c: 685.0
vp8_loop_filter16y_inner_h_rvv_i32: 497.0
vp8_loop_filter16y_inner_v_c: 743.7
vp8_loop_filter16y_inner_v_rvv_i32: 295.7
---
libavcodec/riscv/vp8dsp_init.c | 4 ++
libavcodec/riscv/vp8dsp_rvv.S | 104 +++++++++++++++++++++++++++++++++
2 files changed, 108 insertions(+)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 3acfe75d67..2adff1052a 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -129,6 +129,10 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
}
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_rvv;
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_rvv;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_rvv;
+
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_rvv;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_rvv;
}
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 2ac79a3b77..21fa232325 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -94,6 +94,13 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
ret
endfunc
+.macro filter_abs dst diff fI
+ vneg.v v8, \diff
+ vmax.vv \dst, v8, \diff
+ vmsleu.vx v8, \dst, \fI
+ vmand.mm v27, v27, v8
+.endm
+
.macro filter_fmin len a f1 p0f2 q0f1
vsetvlstatic16 \len
vsext.vf2 \q0f1, \a
@@ -119,6 +126,16 @@ endfunc
vle8.v v11, (t4)
vle8.v v17, (t1)
vle8.v v22, (\dst)
+ .if \normal
+ sub t3, t2, a6
+ sub t0, t1, a6
+ add t6, \dst, a6
+ add a7, t4, a6
+ vle8.v v2, (t3)
+ vle8.v v15, (t0)
+ vle8.v v10, (t6)
+ vle8.v v14, (a7)
+ .endif
.else
addi t1, \dst, -1
addi a6, \dst, -2
@@ -127,9 +144,27 @@ endfunc
vlse8.v v11, (t4), \stride
vlse8.v v17, (t1), \stride
vlse8.v v22, (\dst), \stride
+ .if \normal
+ addi t5, \dst, -4
+ addi t0, \dst, -3
+ addi t6, \dst, 2
+ addi a7, \dst, 3
+ vlse8.v v2, (t5), \stride
+ vlse8.v v15, (t0), \stride
+ vlse8.v v10, (t6), \stride
+ vlse8.v v14, (a7), \stride
+ .endif
.endif
vwsubu.vv v12, v1, v11 // p1-q1
vwsubu.vv v24, v22, v17 // q0-p0
+.if \normal
+ vwsubu.vv v30, v1, v17
+ vwsubu.vv v20, v11, v22
+ vwsubu.vv v28, v1, v15
+ vwsubu.vv v4, v2, v15
+ vwsubu.vv v6, v10, v11
+ vwsubu.vv v2, v14, v10
+.endif
vnclip.wi v23, v12, 0
vsetvlstatic16 \len
// vp8_simple_limit(dst + i, stride, flim)
@@ -141,6 +176,25 @@ endfunc
vsrl.vi v18, v18, 1
vmacc.vx v18, a7, v8
vmsleu.vx v0, v18, \fE
+.if \normal
+ vneg.v v18, v30
+ vmax.vv v30, v18, v30
+ vmsleu.vx v27, v30, \fI
+ filter_abs v18 v28 \fI
+ filter_abs v18 v4 \fI
+ filter_abs v18 v6 \fI
+ filter_abs v18 v2 \fI
+ filter_abs v20 v20 \fI
+ vmand.mm v27, v0, v27 // vp8_simple_limit && normal
+
+ vmsgtu.vx v20, v20, \thresh // hev
+ vmsgtu.vx v3, v30, \thresh
+ vmor.mm v3, v3, v20 // v3 = hev: > thresh
+ vzext.vf2 v18, v1 // v18 = p1
+ vmand.mm v0, v27, v3 // v0 = normal && hev
+ vzext.vf2 v20, v11 // v20 = q1
+ vmnot.m v3, v3 // v3 = !hev
+.endif
li t5, 3
li a7, 124
@@ -165,6 +219,37 @@ endfunc
vsse8.v v6, (\dst), \stride, v0.t
.endif
+.if \normal
+ vmand.mm v0, v27, v3 // vp8_normal_limit & !hev
+
+ .if \inner
+ vnclip.wi v30, v30, 0
+ filter_fmin \len v30 v24 v4 v6
+ vadd.vi v24, v24, 1
+ vsra.vi v24, v24, 1 // (f1 + 1) >> 1;
+ vadd.vv v8, v18, v24
+ vsub.vv v10, v20, v24
+ .endif
+
+ vmax.vx v8, v8, zero
+ vmax.vx v10, v10, zero
+ vsetvlstatic8 \len
+ vnclipu.wi v4, v4, 0
+ vnclipu.wi v5, v6, 0
+ vnclipu.wi v6, v8, 0
+ vnclipu.wi v7, v10, 0
+ .ifc \type,v
+ vse8.v v4, (t1), v0.t
+ vse8.v v5, (\dst), v0.t
+ vse8.v v6, (t2), v0.t
+ vse8.v v7, (t4), v0.t
+ .else
+ vsse8.v v4, (t1), \stride, v0.t
+ vsse8.v v5, (\dst), \stride, v0.t
+ vsse8.v v6, (a6), \stride, v0.t
+ vsse8.v v7, (t4), \stride, v0.t
+ .endif
+.endif
.endm
func ff_vp8_v_loop_filter16_simple_rvv, zve32x
@@ -179,6 +264,25 @@ func ff_vp8_h_loop_filter16_simple_rvv, zve32x
ret
endfunc
+func ff_vp8_h_loop_filter16_inner_rvv, zve32x
+ vsetvlstatic8 16
+ filter 16 h 1 1 a0 a1 a2 a3 a4
+ ret
+endfunc
+
+func ff_vp8_v_loop_filter16_inner_rvv, zve32x
+ vsetvlstatic8 16
+ filter 16 v 1 1 a0 a1 a2 a3 a4
+ ret
+endfunc
+
+func ff_vp8_v_loop_filter8uv_inner_rvv, zve32x
+ vsetvlstatic8 8
+ filter 8 v 1 1 a0 a2 a3 a4 a5
+ filter 8 v 1 1 a1 a2 a3 a4 a5
+ ret
+endfunc
+
.macro bilin_load dst len type mn
.ifc \type,v
add t5, a2, a3
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v3 9/9] lavc/vp8dsp: R-V V loop_filter
[not found] <20240506033809.3790245-1-uk7b@foxmail.com>
` (6 preceding siblings ...)
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 8/9] lavc/vp8dsp: R-V V loop_filter_inner uk7b
@ 2024-05-06 3:38 ` uk7b
7 siblings, 0 replies; 15+ messages in thread
From: uk7b @ 2024-05-06 3:38 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp8_loop_filter8uv_v_c: 745.5
vp8_loop_filter8uv_v_rvv_i32: 467.2
vp8_loop_filter16y_h_c: 674.2
vp8_loop_filter16y_h_rvv_i32: 553.0
vp8_loop_filter16y_v_c: 732.7
vp8_loop_filter16y_v_rvv_i32: 324.5
---
libavcodec/riscv/vp8dsp_init.c | 4 +++
libavcodec/riscv/vp8dsp_rvv.S | 57 ++++++++++++++++++++++++++++++++++
2 files changed, 61 insertions(+)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 2adff1052a..1bb5aad518 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -129,6 +129,10 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
}
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_rvv;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_rvv;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_rvv;
+
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_rvv;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_rvv;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_rvv;
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 21fa232325..567dc96f76 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -229,6 +229,33 @@ endfunc
vsra.vi v24, v24, 1 // (f1 + 1) >> 1;
vadd.vv v8, v18, v24
vsub.vv v10, v20, v24
+ .else
+ li t5, 27
+ li t3, 9
+ li a7, 18
+ vwmul.vx v2, v11, t5
+ vwmul.vx v6, v11, t3
+ vwmul.vx v4, v11, a7
+ vsetvlstatic16 \len
+ li a7, 63
+ vzext.vf2 v14, v15 // p2
+ vzext.vf2 v24, v10 // q2
+ vadd.vx v2, v2, a7
+ vadd.vx v4, v4, a7
+ vadd.vx v6, v6, a7
+ vsra.vi v2, v2, 7 // a0
+ vsra.vi v12, v4, 7 // a1
+ vsra.vi v6, v6, 7 // a2
+ vadd.vv v14, v14, v6 // p2 + a2
+ vsub.vv v22, v24, v6 // q2 - a2
+ vsub.vv v10, v20, v12 // q1 - a1
+ vadd.vv v4, v8, v2 // p0 + a0
+ vsub.vv v6, v16, v2 // q0 - a0
+ vadd.vv v8, v12, v18 // a1 + p1
+ vmax.vx v4, v4, zero
+ vmax.vx v6, v6, zero
+ vmax.vx v14, v14, zero
+ vmax.vx v16, v22, zero
.endif
vmax.vx v8, v8, zero
@@ -249,6 +276,17 @@ endfunc
vsse8.v v6, (a6), \stride, v0.t
vsse8.v v7, (t4), \stride, v0.t
.endif
+ .if !\inner
+ vnclipu.wi v14, v14, 0
+ vnclipu.wi v16, v16, 0
+ .ifc \type,v
+ vse8.v v14, (t0), v0.t
+ vse8.v v16, (t6), v0.t
+ .else
+ vsse8.v v14, (t0), \stride, v0.t
+ vsse8.v v16, (t6), \stride, v0.t
+ .endif
+ .endif
.endif
.endm
@@ -283,6 +321,25 @@ func ff_vp8_v_loop_filter8uv_inner_rvv, zve32x
ret
endfunc
+func ff_vp8_v_loop_filter16_rvv, zve32x
+ vsetvlstatic8 16
+ filter 16 v 1 0 a0 a1 a2 a3 a4
+ ret
+endfunc
+
+func ff_vp8_h_loop_filter16_rvv, zve32x
+ vsetvlstatic8 16
+ filter 16 h 1 0 a0 a1 a2 a3 a4
+ ret
+endfunc
+
+func ff_vp8_v_loop_filter8uv_rvv, zve32x
+ vsetvlstatic8 8
+ filter 8 v 1 0 a0 a2 a3 a4 a5
+ filter 8 v 1 0 a1 a2 a3 a4 a5
+ ret
+endfunc
+
.macro bilin_load dst len type mn
.ifc \type,v
add t5, a2, a3
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v uk7b
@ 2024-05-06 3:44 ` flow gg
2024-05-07 15:59 ` Rémi Denis-Courmont
1 sibling, 0 replies; 15+ messages in thread
From: flow gg @ 2024-05-06 3:44 UTC (permalink / raw)
To: FFmpeg development discussions and patches, Rémi Denis-Courmont
> Doesn't this effectively discard the last element, t5?
> Can't we skip the slide and just load the vector at a2+1? Also then, we can
> keep VL=len and halve the multiplier.
Yes, this is better. I remember that using slide1down was faster when I
tested the initial version, but that is no longer the case.
I modified it to load from a2+1 and merged h and v.
<uk7b@foxmail.com> 于2024年5月6日周一 11:38写道:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908:
> vp8_put_bilin4_h_c: 367.0
> vp8_put_bilin4_h_rvv_i32: 137.7
> vp8_put_bilin4_v_c: 377.0
> vp8_put_bilin4_v_rvv_i32: 137.7
> vp8_put_bilin8_h_c: 1431.0
> vp8_put_bilin8_h_rvv_i32: 297.5
> vp8_put_bilin8_v_c: 1449.0
> vp8_put_bilin8_v_rvv_i32: 297.5
> vp8_put_bilin16_h_c: 2839.0
> vp8_put_bilin16_h_rvv_i32: 344.7
> vp8_put_bilin16_v_c: 2857.0
> vp8_put_bilin16_v_rvv_i32: 344.7
> ---
> libavcodec/riscv/vp8dsp_init.c | 21 +++++++++++++++
> libavcodec/riscv/vp8dsp_rvv.S | 49 ++++++++++++++++++++++++++++++++++
> 2 files changed, 70 insertions(+)
>
> diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> index fa3feeacf7..afffa6de2f 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -34,6 +34,10 @@ VP8_EPEL(16, rvi);
> VP8_EPEL(8, rvi);
> VP8_EPEL(4, rvi);
>
> +VP8_BILIN(16, rvv);
> +VP8_BILIN(8, rvv);
> +VP8_BILIN(4, rvv);
> +
> av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> {
> #if HAVE_RV
> @@ -48,6 +52,23 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> c->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
> c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
> }
> +#if HAVE_RVV
> + if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> + c->put_vp8_bilinear_pixels_tab[0][0][1] =
> ff_put_vp8_bilin16_h_rvv;
> + c->put_vp8_bilinear_pixels_tab[0][0][2] =
> ff_put_vp8_bilin16_h_rvv;
> + c->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_rvv;
> + c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv;
> + c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv;
> + c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv;
> +
> + c->put_vp8_bilinear_pixels_tab[0][1][0] =
> ff_put_vp8_bilin16_v_rvv;
> + c->put_vp8_bilinear_pixels_tab[0][2][0] =
> ff_put_vp8_bilin16_v_rvv;
> + c->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_rvv;
> + c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv;
> + c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv;
> + c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv;
> + }
> +#endif
> #endif
> }
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 8a0773f964..9bf969d794 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -20,6 +20,18 @@
>
> #include "libavutil/riscv/asm.S"
>
> +.macro vsetvlstatic8 len
> +.if \len <= 4
> + vsetivli zero, \len, e8, mf4, ta, ma
> +.elseif \len <= 8
> + vsetivli zero, \len, e8, mf2, ta, ma
> +.elseif \len <= 16
> + vsetivli zero, \len, e8, m1, ta, ma
> +.elseif \len <= 31
> + vsetivli zero, \len, e8, m2, ta, ma
> +.endif
> +.endm
> +
> .macro vp8_idct_dc_add
> vlse32.v v0, (a0), a2
> lh a5, 0(a1)
> @@ -71,3 +83,40 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
>
> ret
> endfunc
> +
> +.macro bilin_load dst len type mn
> +.ifc \type,v
> + add t5, a2, a3
> +.elseif \type == h
> + addi t5, a2, 1
> +.endif
> + vle8.v \dst, (a2)
> + vle8.v v2, (t5)
> + vwmulu.vx v28, \dst, t1
> + vwmaccu.vx v28, \mn, v2
> + vwaddu.wx v24, v28, t4
> + vnsra.wi \dst, v24, 3
> +.endm
> +
> +.macro put_vp8_bilin_h_v len type mn
> +func ff_put_vp8_bilin\len\()_\type\()_rvv, zve32x
> + vsetvlstatic8 \len
> + li t1, 8
> + li t4, 4
> + sub t1, t1, \mn
> +1:
> + addi a4, a4, -1
> + bilin_load v0, \len, \type, \mn
> + vse8.v v0, (a0)
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> +
> + ret
> +endfunc
> +.endm
> +
> +.irp len 16,8,4
> +put_vp8_bilin_h_v \len h a5
> +put_vp8_bilin_h_v \len v a6
> +.endr
> --
> 2.45.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv uk7b
@ 2024-05-06 19:24 ` Rémi Denis-Courmont
2024-05-07 2:27 ` [FFmpeg-devel] [PATCH " uk7b
2024-05-07 2:31 ` [FFmpeg-devel] [PATCH v3 " flow gg
0 siblings, 2 replies; 15+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-06 19:24 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
Le maanantaina 6. toukokuuta 2024, 6.38.06 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908:
> vp8_put_epel4_h4v4_c: 20.0
> vp8_put_epel4_h4v4_rvv_i32: 11.0
> vp8_put_epel4_h4v6_c: 25.2
> vp8_put_epel4_h4v6_rvv_i32: 13.5
> vp8_put_epel4_h6v4_c: 22.2
> vp8_put_epel4_h6v4_rvv_i32: 14.5
> vp8_put_epel4_h6v6_c: 29.0
> vp8_put_epel4_h6v6_rvv_i32: 15.7
> vp8_put_epel8_h4v4_c: 73.0
> vp8_put_epel8_h4v4_rvv_i32: 22.2
> vp8_put_epel8_h4v6_c: 90.5
> vp8_put_epel8_h4v6_rvv_i32: 26.7
> vp8_put_epel8_h6v4_c: 85.0
> vp8_put_epel8_h6v4_rvv_i32: 27.2
> vp8_put_epel8_h6v6_c: 104.7
> vp8_put_epel8_h6v6_rvv_i32: 29.5
> vp8_put_epel16_h4v4_c: 145.5
> vp8_put_epel16_h4v4_rvv_i32: 26.5
> vp8_put_epel16_h4v6_c: 190.7
> vp8_put_epel16_h4v6_rvv_i32: 47.5
> vp8_put_epel16_h6v4_c: 173.7
> vp8_put_epel16_h6v4_rvv_i32: 33.2
> vp8_put_epel16_h6v6_c: 222.2
> vp8_put_epel16_h6v6_rvv_i32: 35.5
> ---
> libavcodec/riscv/vp8dsp_init.c | 13 ++++
> libavcodec/riscv/vp8dsp_rvv.S | 117 +++++++++++++++++++++++++++------
> 2 files changed, 109 insertions(+), 21 deletions(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
> index dc3e087f01..463c8fa0a2 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
> c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
> c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
> +
> + c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
> + c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
> + c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
> + c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
> + c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
> + c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
> + c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
> + c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
> + c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
> + c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
> + c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
> + c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
> }
> #endif
> #endif
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index bf268e4d8d..baa8152830 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -161,26 +161,26 @@ const subpel_filters
> .byte 0, -1, 12, 123, -6, 0
> endconst
>
> -.macro epel_filter size type
> - lla t2, subpel_filters
> +.macro epel_filter size type regtype
> + lla \regtype\()2, subpel_filters
> .ifc \type,v
> - addi t0, a6, -1
> + addi \regtype\()0, a6, -1
IMO, passing a complete register name, if you really need to vary it, would be
simpler and more flexible than an ABI register type prefix.
> .elseif \type == h
> - addi t0, a5, -1
> + addi \regtype\()0, a5, -1
> .endif
> - li t1, 6
> - mul t0, t0, t1
> - add t0, t0, t2
> + li \regtype\()1, 6
> + mul \regtype\()0, \regtype\()0, \regtype\()1
> + add \regtype\()0, \regtype\()0, \regtype\()2
> .irp n 1,2,3,4
> - lb t\n, \n(t0)
> + lb \regtype\n, \n(\regtype\()0)
> .endr
> .ifc \size,6
> - lb t5, 5(t0)
> - lb t0, (t0)
> + lb \regtype\()5, 5(\regtype\()0)
> + lb \regtype\()0, (\regtype\()0)
> .endif
> .endm
>
> -.macro epel_load dst len size type
> +.macro epel_load dst len size type from_mem regtype
> .ifc \type,v
> mv a5, a3
> .else
> @@ -189,24 +189,35 @@ endconst
> sub t6, a2, a5
> add a7, a2, a5
>
> +.if \from_mem
> vle8.v v24, (a2)
> vle8.v v22, (t6)
> vle8.v v26, (a7)
> add a7, a7, a5
> vle8.v v28, (a7)
> - vwmulu.vx v16, v24, t2
> - vwmulu.vx v20, v26, t3
> + vwmulu.vx v16, v24, \regtype\()2
> + vwmulu.vx v20, v26, \regtype\()3
> .ifc \size,6
> sub t6, t6, a5
> add a7, a7, a5
> vle8.v v24, (t6)
> vle8.v v26, (a7)
> - vwmaccu.vx v16, t0, v24
> - vwmaccu.vx v16, t5, v26
> + vwmaccu.vx v16, \regtype\()0, v24
> + vwmaccu.vx v16, \regtype\()5, v26
> +.endif
> + vwmaccsu.vx v16, \regtype\()1, v22
> + vwmaccsu.vx v16, \regtype\()4, v28
> +.else
> + vwmulu.vx v16, v4, \regtype\()2
> + vwmulu.vx v20, v6, \regtype\()3
> + .ifc \size,6
> + vwmaccu.vx v16, \regtype\()0, v0
> + vwmaccu.vx v16, \regtype\()5, v10
> + .endif
> + vwmaccsu.vx v16, \regtype\()1, v2
> + vwmaccsu.vx v16, \regtype\()4, v8
> .endif
> li t6, 64
> - vwmaccsu.vx v16, t1, v22
> - vwmaccsu.vx v16, t4, v28
> vwadd.wx v16, v16, t6
> vsetvlstatic16 \len
> vwadd.vv v24, v16, v20
> @@ -216,18 +227,18 @@ endconst
> vnclipu.wi \dst, v24, 0
> .endm
>
> -.macro epel_load_inc dst len size type
> - epel_load \dst \len \size \type
> +.macro epel_load_inc dst len size type from_mem regtype
> + epel_load \dst \len \size \type \from_mem \regtype
> add a2, a2, a3
> .endm
>
> .macro epel len size type
> func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> - epel_filter \size \type
> + epel_filter \size \type t
> vsetvlstatic8 \len
> 1:
> addi a4, a4, -1
> - epel_load_inc v30 \len \size \type
> + epel_load_inc v30 \len \size \type 1 t
> vse8.v v30, (a0)
> add a0, a0, a1
> bnez a4, 1b
> @@ -236,6 +247,66 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> zve32x endfunc
> .endm
>
> +.macro epel_hv len hsize vsize
> +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
> + addi sp, sp, -48
> + .irp n 0,1,2,3,4,5
> +#if __riscv_xlen >= 64
This code actually requires ==, not >=.
> + sd s\n, \n\()<<3(sp)
> +#else
> + sw s\n, \n\()<<3(sp)
You can do that but you only need half the stack space and offsets.
(And that's why I avoid S and FS registers like the plague, but sometimes you
just can't.)
> +#endif
> + .endr
> + sub a2, a2, a3
> + epel_filter \hsize h t
> + epel_filter \vsize v s
> + vsetvlstatic8 \len
> +.if \hsize == 6 || \vsize == 6
> + sub a2, a2, a3
> + epel_load_inc v0 \len \hsize h 1 t
> +.endif
> + epel_load_inc v2 \len \hsize h 1 t
> + epel_load_inc v4 \len \hsize h 1 t
> + epel_load_inc v6 \len \hsize h 1 t
> + epel_load_inc v8 \len \hsize h 1 t
> +.if \hsize == 6 || \vsize == 6
> + epel_load_inc v10 \len \hsize h 1 t
> +.endif
> + addi a4, a4, -1
> +1:
> + addi a4, a4, -1
> + epel_load v30 \len \vsize v 0 s
> + vse8.v v30, (a0)
> +.if \hsize == 6 || \vsize == 6
> + vmv.v.v v0, v2
> +.endif
> + vmv.v.v v2, v4
> + vmv.v.v v4, v6
> + vmv.v.v v6, v8
> +.if \hsize == 6 || \vsize == 6
> + vmv.v.v v8, v10
> + epel_load_inc v10 \len \hsize h 1 t
> +.else
> + epel_load_inc v8 \len 4 h 1 t
> +.endif
> + add a0, a0, a1
> + bnez a4, 1b
> + epel_load v30 \len \vsize v 0 s
> + vse8.v v30, (a0)
> +
> + .irp n 0,1,2,3,4,5
> +#if __riscv_xlen >= 64
> + ld s\n, \n\()<<3(sp)
> +#else
> + lw s\n, \n\()<<3(sp)
> +#endif
> + .endr
> + addi sp, sp, 48
> +
> + ret
> +endfunc
> +.endm
> +
> .irp len 16,8,4
> put_vp8_bilin_h_v \len h a5
> put_vp8_bilin_h_v \len v a6
> @@ -244,4 +315,8 @@ epel \len 6 h
> epel \len 4 h
> epel \len 6 v
> epel \len 4 v
> +epel_hv \len 6 6
> +epel_hv \len 4 4
> +epel_hv \len 6 4
> +epel_hv \len 4 6
> .endr
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH 6/9] lavc/vp8dsp: R-V V put_epel hv
2024-05-06 19:24 ` Rémi Denis-Courmont
@ 2024-05-07 2:27 ` uk7b
2024-05-07 2:31 ` [FFmpeg-devel] [PATCH v3 " flow gg
1 sibling, 0 replies; 15+ messages in thread
From: uk7b @ 2024-05-07 2:27 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp8_put_epel4_h4v4_c: 20.0
vp8_put_epel4_h4v4_rvv_i32: 11.0
vp8_put_epel4_h4v6_c: 25.2
vp8_put_epel4_h4v6_rvv_i32: 13.5
vp8_put_epel4_h6v4_c: 22.2
vp8_put_epel4_h6v4_rvv_i32: 14.5
vp8_put_epel4_h6v6_c: 29.0
vp8_put_epel4_h6v6_rvv_i32: 15.7
vp8_put_epel8_h4v4_c: 73.0
vp8_put_epel8_h4v4_rvv_i32: 22.2
vp8_put_epel8_h4v6_c: 90.5
vp8_put_epel8_h4v6_rvv_i32: 26.7
vp8_put_epel8_h6v4_c: 85.0
vp8_put_epel8_h6v4_rvv_i32: 27.2
vp8_put_epel8_h6v6_c: 104.7
vp8_put_epel8_h6v6_rvv_i32: 29.5
vp8_put_epel16_h4v4_c: 145.5
vp8_put_epel16_h4v4_rvv_i32: 26.5
vp8_put_epel16_h4v6_c: 190.7
vp8_put_epel16_h4v6_rvv_i32: 47.5
vp8_put_epel16_h6v4_c: 173.7
vp8_put_epel16_h6v4_rvv_i32: 33.2
vp8_put_epel16_h6v6_c: 222.2
vp8_put_epel16_h6v6_rvv_i32: 35.5
---
libavcodec/riscv/vp8dsp_init.c | 13 ++++
libavcodec/riscv/vp8dsp_rvv.S | 123 +++++++++++++++++++++++++++------
2 files changed, 115 insertions(+), 21 deletions(-)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index dc3e087f01..463c8fa0a2 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
+
+ c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
+ c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
+ c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
+ c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
+ c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
+ c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
+ c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
+ c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
+ c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
+ c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
+ c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
+ c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
}
#endif
#endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index bf268e4d8d..71945f3d77 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -161,26 +161,26 @@ const subpel_filters
.byte 0, -1, 12, 123, -6, 0
endconst
-.macro epel_filter size type
- lla t2, subpel_filters
+.macro epel_filter size type regtype
+ lla \regtype\()2, subpel_filters
.ifc \type,v
- addi t0, a6, -1
+ addi \regtype\()0, a6, -1
.elseif \type == h
- addi t0, a5, -1
+ addi \regtype\()0, a5, -1
.endif
- li t1, 6
- mul t0, t0, t1
- add t0, t0, t2
+ li \regtype\()1, 6
+ mul \regtype\()0, \regtype\()0, \regtype\()1
+ add \regtype\()0, \regtype\()0, \regtype\()2
.irp n 1,2,3,4
- lb t\n, \n(t0)
+ lb \regtype\n, \n(\regtype\()0)
.endr
.ifc \size,6
- lb t5, 5(t0)
- lb t0, (t0)
+ lb \regtype\()5, 5(\regtype\()0)
+ lb \regtype\()0, (\regtype\()0)
.endif
.endm
-.macro epel_load dst len size type
+.macro epel_load dst len size type from_mem regtype
.ifc \type,v
mv a5, a3
.else
@@ -189,24 +189,35 @@ endconst
sub t6, a2, a5
add a7, a2, a5
+.if \from_mem
vle8.v v24, (a2)
vle8.v v22, (t6)
vle8.v v26, (a7)
add a7, a7, a5
vle8.v v28, (a7)
- vwmulu.vx v16, v24, t2
- vwmulu.vx v20, v26, t3
+ vwmulu.vx v16, v24, \regtype\()2
+ vwmulu.vx v20, v26, \regtype\()3
.ifc \size,6
sub t6, t6, a5
add a7, a7, a5
vle8.v v24, (t6)
vle8.v v26, (a7)
- vwmaccu.vx v16, t0, v24
- vwmaccu.vx v16, t5, v26
+ vwmaccu.vx v16, \regtype\()0, v24
+ vwmaccu.vx v16, \regtype\()5, v26
+.endif
+ vwmaccsu.vx v16, \regtype\()1, v22
+ vwmaccsu.vx v16, \regtype\()4, v28
+.else
+ vwmulu.vx v16, v4, \regtype\()2
+ vwmulu.vx v20, v6, \regtype\()3
+ .ifc \size,6
+ vwmaccu.vx v16, \regtype\()0, v0
+ vwmaccu.vx v16, \regtype\()5, v10
+ .endif
+ vwmaccsu.vx v16, \regtype\()1, v2
+ vwmaccsu.vx v16, \regtype\()4, v8
.endif
li t6, 64
- vwmaccsu.vx v16, t1, v22
- vwmaccsu.vx v16, t4, v28
vwadd.wx v16, v16, t6
vsetvlstatic16 \len
vwadd.vv v24, v16, v20
@@ -216,18 +227,18 @@ endconst
vnclipu.wi \dst, v24, 0
.endm
-.macro epel_load_inc dst len size type
- epel_load \dst \len \size \type
+.macro epel_load_inc dst len size type from_mem regtype
+ epel_load \dst \len \size \type \from_mem \regtype
add a2, a2, a3
.endm
.macro epel len size type
func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
- epel_filter \size \type
+ epel_filter \size \type t
vsetvlstatic8 \len
1:
addi a4, a4, -1
- epel_load_inc v30 \len \size \type
+ epel_load_inc v30 \len \size \type 1 t
vse8.v v30, (a0)
add a0, a0, a1
bnez a4, 1b
@@ -236,6 +247,72 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
endfunc
.endm
+.macro epel_hv len hsize vsize
+func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
+#if __riscv_xlen == 64
+ addi sp, sp, -48
+ .irp n 0,1,2,3,4,5
+ sd s\n, \n\()<<3(sp)
+ .endr
+#else
+ addi sp, sp, -24
+ .irp n 0,1,2,3,4,5
+ sw s\n, \n\()<<2(sp)
+ .endr
+#endif
+ sub a2, a2, a3
+ epel_filter \hsize h t
+ epel_filter \vsize v s
+ vsetvlstatic8 \len
+.if \hsize == 6 || \vsize == 6
+ sub a2, a2, a3
+ epel_load_inc v0 \len \hsize h 1 t
+.endif
+ epel_load_inc v2 \len \hsize h 1 t
+ epel_load_inc v4 \len \hsize h 1 t
+ epel_load_inc v6 \len \hsize h 1 t
+ epel_load_inc v8 \len \hsize h 1 t
+.if \hsize == 6 || \vsize == 6
+ epel_load_inc v10 \len \hsize h 1 t
+.endif
+ addi a4, a4, -1
+1:
+ addi a4, a4, -1
+ epel_load v30 \len \vsize v 0 s
+ vse8.v v30, (a0)
+.if \hsize == 6 || \vsize == 6
+ vmv.v.v v0, v2
+.endif
+ vmv.v.v v2, v4
+ vmv.v.v v4, v6
+ vmv.v.v v6, v8
+.if \hsize == 6 || \vsize == 6
+ vmv.v.v v8, v10
+ epel_load_inc v10 \len \hsize h 1 t
+.else
+ epel_load_inc v8 \len 4 h 1 t
+.endif
+ add a0, a0, a1
+ bnez a4, 1b
+ epel_load v30 \len \vsize v 0 s
+ vse8.v v30, (a0)
+
+#if __riscv_xlen == 64
+ .irp n 0,1,2,3,4,5
+ ld s\n, \n\()<<3(sp)
+ .endr
+ addi sp, sp, 48
+#else
+ .irp n 0,1,2,3,4,5
+ lw s\n, \n\()<<2(sp)
+ .endr
+ addi sp, sp, 24
+#endif
+
+ ret
+endfunc
+.endm
+
.irp len 16,8,4
put_vp8_bilin_h_v \len h a5
put_vp8_bilin_h_v \len v a6
@@ -244,4 +321,8 @@ epel \len 6 h
epel \len 4 h
epel \len 6 v
epel \len 4 v
+epel_hv \len 6 6
+epel_hv \len 4 4
+epel_hv \len 6 4
+epel_hv \len 4 6
.endr
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv
2024-05-06 19:24 ` Rémi Denis-Courmont
2024-05-07 2:27 ` [FFmpeg-devel] [PATCH " uk7b
@ 2024-05-07 2:31 ` flow gg
1 sibling, 0 replies; 15+ messages in thread
From: flow gg @ 2024-05-07 2:31 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> IMO, passing a complete register name, if you really need to vary it,
> would be simpler and more flexible than an ABI register type prefix.
If full register names were passed here, some cases would require four
parameters, some would require six, and there would often be repetition.
I feel it's easy to get confused about the differences between the
parameters passed each time.
If we use a prefix instead, we only need one parameter, which I think would
be less error-prone.
> This code actually requires ==, not >=.
> You can do that but you only need half the stack space and offsets.
Ok, fixed it
Rémi Denis-Courmont <remi@remlab.net> 于2024年5月7日周二 03:25写道:
> Le maanantaina 6. toukokuuta 2024, 6.38.06 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> > C908:
> > vp8_put_epel4_h4v4_c: 20.0
> > vp8_put_epel4_h4v4_rvv_i32: 11.0
> > vp8_put_epel4_h4v6_c: 25.2
> > vp8_put_epel4_h4v6_rvv_i32: 13.5
> > vp8_put_epel4_h6v4_c: 22.2
> > vp8_put_epel4_h6v4_rvv_i32: 14.5
> > vp8_put_epel4_h6v6_c: 29.0
> > vp8_put_epel4_h6v6_rvv_i32: 15.7
> > vp8_put_epel8_h4v4_c: 73.0
> > vp8_put_epel8_h4v4_rvv_i32: 22.2
> > vp8_put_epel8_h4v6_c: 90.5
> > vp8_put_epel8_h4v6_rvv_i32: 26.7
> > vp8_put_epel8_h6v4_c: 85.0
> > vp8_put_epel8_h6v4_rvv_i32: 27.2
> > vp8_put_epel8_h6v6_c: 104.7
> > vp8_put_epel8_h6v6_rvv_i32: 29.5
> > vp8_put_epel16_h4v4_c: 145.5
> > vp8_put_epel16_h4v4_rvv_i32: 26.5
> > vp8_put_epel16_h4v6_c: 190.7
> > vp8_put_epel16_h4v6_rvv_i32: 47.5
> > vp8_put_epel16_h6v4_c: 173.7
> > vp8_put_epel16_h6v4_rvv_i32: 33.2
> > vp8_put_epel16_h6v6_c: 222.2
> > vp8_put_epel16_h6v6_rvv_i32: 35.5
> > ---
> > libavcodec/riscv/vp8dsp_init.c | 13 ++++
> > libavcodec/riscv/vp8dsp_rvv.S | 117 +++++++++++++++++++++++++++------
> > 2 files changed, 109 insertions(+), 21 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> > index dc3e087f01..463c8fa0a2 100644
> > --- a/libavcodec/riscv/vp8dsp_init.c
> > +++ b/libavcodec/riscv/vp8dsp_init.c
> > @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> > c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
> > c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
> > c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
> > +
> > + c->put_vp8_epel_pixels_tab[0][2][2] =
> ff_put_vp8_epel16_h6v6_rvv;
> > + c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
> > + c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
> > + c->put_vp8_epel_pixels_tab[0][2][1] =
> ff_put_vp8_epel16_h4v6_rvv;
> > + c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
> > + c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
> > + c->put_vp8_epel_pixels_tab[0][1][1] =
> ff_put_vp8_epel16_h4v4_rvv;
> > + c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
> > + c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
> > + c->put_vp8_epel_pixels_tab[0][1][2] =
> ff_put_vp8_epel16_h6v4_rvv;
> > + c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
> > + c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
> > }
> > #endif
> > #endif
> > diff --git a/libavcodec/riscv/vp8dsp_rvv.S
> b/libavcodec/riscv/vp8dsp_rvv.S
> > index bf268e4d8d..baa8152830 100644
> > --- a/libavcodec/riscv/vp8dsp_rvv.S
> > +++ b/libavcodec/riscv/vp8dsp_rvv.S
> > @@ -161,26 +161,26 @@ const subpel_filters
> > .byte 0, -1, 12, 123, -6, 0
> > endconst
> >
> > -.macro epel_filter size type
> > - lla t2, subpel_filters
> > +.macro epel_filter size type regtype
> > + lla \regtype\()2, subpel_filters
> > .ifc \type,v
> > - addi t0, a6, -1
> > + addi \regtype\()0, a6, -1
>
> IMO, passing a complete register name, if you really need to vary it,
> would be
> simpler and more flexible than an ABI register type prefix.
>
> > .elseif \type == h
> > - addi t0, a5, -1
> > + addi \regtype\()0, a5, -1
> > .endif
> > - li t1, 6
> > - mul t0, t0, t1
> > - add t0, t0, t2
> > + li \regtype\()1, 6
> > + mul \regtype\()0, \regtype\()0, \regtype\()1
> > + add \regtype\()0, \regtype\()0, \regtype\()2
> > .irp n 1,2,3,4
> > - lb t\n, \n(t0)
> > + lb \regtype\n, \n(\regtype\()0)
> > .endr
> > .ifc \size,6
> > - lb t5, 5(t0)
> > - lb t0, (t0)
> > + lb \regtype\()5, 5(\regtype\()0)
> > + lb \regtype\()0, (\regtype\()0)
> > .endif
> > .endm
> >
> > -.macro epel_load dst len size type
> > +.macro epel_load dst len size type from_mem regtype
> > .ifc \type,v
> > mv a5, a3
> > .else
> > @@ -189,24 +189,35 @@ endconst
> > sub t6, a2, a5
> > add a7, a2, a5
> >
> > +.if \from_mem
> > vle8.v v24, (a2)
> > vle8.v v22, (t6)
> > vle8.v v26, (a7)
> > add a7, a7, a5
> > vle8.v v28, (a7)
> > - vwmulu.vx v16, v24, t2
> > - vwmulu.vx v20, v26, t3
> > + vwmulu.vx v16, v24, \regtype\()2
> > + vwmulu.vx v20, v26, \regtype\()3
> > .ifc \size,6
> > sub t6, t6, a5
> > add a7, a7, a5
> > vle8.v v24, (t6)
> > vle8.v v26, (a7)
> > - vwmaccu.vx v16, t0, v24
> > - vwmaccu.vx v16, t5, v26
> > + vwmaccu.vx v16, \regtype\()0, v24
> > + vwmaccu.vx v16, \regtype\()5, v26
> > +.endif
> > + vwmaccsu.vx v16, \regtype\()1, v22
> > + vwmaccsu.vx v16, \regtype\()4, v28
> > +.else
> > + vwmulu.vx v16, v4, \regtype\()2
> > + vwmulu.vx v20, v6, \regtype\()3
> > + .ifc \size,6
> > + vwmaccu.vx v16, \regtype\()0, v0
> > + vwmaccu.vx v16, \regtype\()5, v10
> > + .endif
> > + vwmaccsu.vx v16, \regtype\()1, v2
> > + vwmaccsu.vx v16, \regtype\()4, v8
> > .endif
> > li t6, 64
> > - vwmaccsu.vx v16, t1, v22
> > - vwmaccsu.vx v16, t4, v28
> > vwadd.wx v16, v16, t6
> > vsetvlstatic16 \len
> > vwadd.vv v24, v16, v20
> > @@ -216,18 +227,18 @@ endconst
> > vnclipu.wi \dst, v24, 0
> > .endm
> >
> > -.macro epel_load_inc dst len size type
> > - epel_load \dst \len \size \type
> > +.macro epel_load_inc dst len size type from_mem regtype
> > + epel_load \dst \len \size \type \from_mem \regtype
> > add a2, a2, a3
> > .endm
> >
> > .macro epel len size type
> > func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> > - epel_filter \size \type
> > + epel_filter \size \type t
> > vsetvlstatic8 \len
> > 1:
> > addi a4, a4, -1
> > - epel_load_inc v30 \len \size \type
> > + epel_load_inc v30 \len \size \type 1 t
> > vse8.v v30, (a0)
> > add a0, a0, a1
> > bnez a4, 1b
> > @@ -236,6 +247,66 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> > zve32x endfunc
> > .endm
> >
> > +.macro epel_hv len hsize vsize
> > +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
> > + addi sp, sp, -48
> > + .irp n 0,1,2,3,4,5
> > +#if __riscv_xlen >= 64
>
> This code actually requires ==, not >=.
>
> > + sd s\n, \n\()<<3(sp)
> > +#else
> > + sw s\n, \n\()<<3(sp)
>
> You can do that but you only need half the stack space and offsets.
>
> (And that's why I avoid S and FS registers like the plague, but sometimes
> you
> just can't.)
>
> > +#endif
> > + .endr
> > + sub a2, a2, a3
> > + epel_filter \hsize h t
> > + epel_filter \vsize v s
> > + vsetvlstatic8 \len
> > +.if \hsize == 6 || \vsize == 6
> > + sub a2, a2, a3
> > + epel_load_inc v0 \len \hsize h 1 t
> > +.endif
> > + epel_load_inc v2 \len \hsize h 1 t
> > + epel_load_inc v4 \len \hsize h 1 t
> > + epel_load_inc v6 \len \hsize h 1 t
> > + epel_load_inc v8 \len \hsize h 1 t
> > +.if \hsize == 6 || \vsize == 6
> > + epel_load_inc v10 \len \hsize h 1 t
> > +.endif
> > + addi a4, a4, -1
> > +1:
> > + addi a4, a4, -1
> > + epel_load v30 \len \vsize v 0 s
> > + vse8.v v30, (a0)
> > +.if \hsize == 6 || \vsize == 6
> > + vmv.v.v v0, v2
> > +.endif
> > + vmv.v.v v2, v4
> > + vmv.v.v v4, v6
> > + vmv.v.v v6, v8
> > +.if \hsize == 6 || \vsize == 6
> > + vmv.v.v v8, v10
> > + epel_load_inc v10 \len \hsize h 1 t
> > +.else
> > + epel_load_inc v8 \len 4 h 1 t
> > +.endif
> > + add a0, a0, a1
> > + bnez a4, 1b
> > + epel_load v30 \len \vsize v 0 s
> > + vse8.v v30, (a0)
> > +
> > + .irp n 0,1,2,3,4,5
> > +#if __riscv_xlen >= 64
> > + ld s\n, \n\()<<3(sp)
> > +#else
> > + lw s\n, \n\()<<3(sp)
> > +#endif
> > + .endr
> > + addi sp, sp, 48
> > +
> > + ret
> > +endfunc
> > +.endm
> > +
> > .irp len 16,8,4
> > put_vp8_bilin_h_v \len h a5
> > put_vp8_bilin_h_v \len v a6
> > @@ -244,4 +315,8 @@ epel \len 6 h
> > epel \len 4 h
> > epel \len 6 v
> > epel \len 4 v
> > +epel_hv \len 6 6
> > +epel_hv \len 4 4
> > +epel_hv \len 6 4
> > +epel_hv \len 4 6
> > .endr
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v uk7b
2024-05-06 3:44 ` flow gg
@ 2024-05-07 15:59 ` Rémi Denis-Courmont
2024-05-07 16:07 ` flow gg
1 sibling, 1 reply; 15+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-07 15:59 UTC (permalink / raw)
To: ffmpeg-devel
Le maanantaina 6. toukokuuta 2024, 6.38.02 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908:
> vp8_put_bilin4_h_c: 367.0
> vp8_put_bilin4_h_rvv_i32: 137.7
> vp8_put_bilin4_v_c: 377.0
> vp8_put_bilin4_v_rvv_i32: 137.7
> vp8_put_bilin8_h_c: 1431.0
> vp8_put_bilin8_h_rvv_i32: 297.5
> vp8_put_bilin8_v_c: 1449.0
> vp8_put_bilin8_v_rvv_i32: 297.5
> vp8_put_bilin16_h_c: 2839.0
> vp8_put_bilin16_h_rvv_i32: 344.7
> vp8_put_bilin16_v_c: 2857.0
> vp8_put_bilin16_v_rvv_i32: 344.7
> ---
> libavcodec/riscv/vp8dsp_init.c | 21 +++++++++++++++
> libavcodec/riscv/vp8dsp_rvv.S | 49 ++++++++++++++++++++++++++++++++++
> 2 files changed, 70 insertions(+)
>
> diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
> index fa3feeacf7..afffa6de2f 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -34,6 +34,10 @@ VP8_EPEL(16, rvi);
> VP8_EPEL(8, rvi);
> VP8_EPEL(4, rvi);
>
> +VP8_BILIN(16, rvv);
> +VP8_BILIN(8, rvv);
> +VP8_BILIN(4, rvv);
> +
> av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> {
> #if HAVE_RV
> @@ -48,6 +52,23 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> c->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
> c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
> }
> +#if HAVE_RVV
> + if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> + c->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_rvv;
> + c->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_rvv;
> + c->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_rvv;
> + c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv;
> + c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv;
> + c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv;
> +
> + c->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_rvv;
> + c->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_rvv;
> + c->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_rvv;
> + c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv;
> + c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv;
> + c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv;
> + }
> +#endif
> #endif
> }
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 8a0773f964..9bf969d794 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -20,6 +20,18 @@
>
> #include "libavutil/riscv/asm.S"
>
> +.macro vsetvlstatic8 len
> +.if \len <= 4
> + vsetivli zero, \len, e8, mf4, ta, ma
> +.elseif \len <= 8
> + vsetivli zero, \len, e8, mf2, ta, ma
> +.elseif \len <= 16
> + vsetivli zero, \len, e8, m1, ta, ma
> +.elseif \len <= 31
> + vsetivli zero, \len, e8, m2, ta, ma
> +.endif
> +.endm
> +
> .macro vp8_idct_dc_add
> vlse32.v v0, (a0), a2
> lh a5, 0(a1)
> @@ -71,3 +83,40 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
>
> ret
> endfunc
> +
> +.macro bilin_load dst len type mn
> +.ifc \type,v
> + add t5, a2, a3
> +.elseif \type == h
h is not a number so that's not a valid condition.
> + addi t5, a2, 1
> +.endif
> + vle8.v \dst, (a2)
> + vle8.v v2, (t5)
> + vwmulu.vx v28, \dst, t1
> + vwmaccu.vx v28, \mn, v2
> + vwaddu.wx v24, v28, t4
> + vnsra.wi \dst, v24, 3
> +.endm
> +
> +.macro put_vp8_bilin_h_v len type mn
> +func ff_put_vp8_bilin\len\()_\type\()_rvv, zve32x
> + vsetvlstatic8 \len
> + li t1, 8
> + li t4, 4
> + sub t1, t1, \mn
> +1:
> + addi a4, a4, -1
> + bilin_load v0, \len, \type, \mn
> + vse8.v v0, (a0)
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> +
> + ret
> +endfunc
> +.endm
> +
> +.irp len 16,8,4
> +put_vp8_bilin_h_v \len h a5
> +put_vp8_bilin_h_v \len v a6
> +.endr
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v
2024-05-07 15:59 ` Rémi Denis-Courmont
@ 2024-05-07 16:07 ` flow gg
2024-05-07 16:10 ` Rémi Denis-Courmont
0 siblings, 1 reply; 15+ messages in thread
From: flow gg @ 2024-05-07 16:07 UTC (permalink / raw)
To: FFmpeg development discussions and patches
I didn't understand what you mean... What does judging whether the type is
'h' or 'v' have to do with the number?
Rémi Denis-Courmont <remi@remlab.net> 于2024年5月8日周三 00:00写道:
> Le maanantaina 6. toukokuuta 2024, 6.38.02 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> > C908:
> > vp8_put_bilin4_h_c: 367.0
> > vp8_put_bilin4_h_rvv_i32: 137.7
> > vp8_put_bilin4_v_c: 377.0
> > vp8_put_bilin4_v_rvv_i32: 137.7
> > vp8_put_bilin8_h_c: 1431.0
> > vp8_put_bilin8_h_rvv_i32: 297.5
> > vp8_put_bilin8_v_c: 1449.0
> > vp8_put_bilin8_v_rvv_i32: 297.5
> > vp8_put_bilin16_h_c: 2839.0
> > vp8_put_bilin16_h_rvv_i32: 344.7
> > vp8_put_bilin16_v_c: 2857.0
> > vp8_put_bilin16_v_rvv_i32: 344.7
> > ---
> > libavcodec/riscv/vp8dsp_init.c | 21 +++++++++++++++
> > libavcodec/riscv/vp8dsp_rvv.S | 49 ++++++++++++++++++++++++++++++++++
> > 2 files changed, 70 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
> > index fa3feeacf7..afffa6de2f 100644
> > --- a/libavcodec/riscv/vp8dsp_init.c
> > +++ b/libavcodec/riscv/vp8dsp_init.c
> > @@ -34,6 +34,10 @@ VP8_EPEL(16, rvi);
> > VP8_EPEL(8, rvi);
> > VP8_EPEL(4, rvi);
> >
> > +VP8_BILIN(16, rvv);
> > +VP8_BILIN(8, rvv);
> > +VP8_BILIN(4, rvv);
> > +
> > av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> > {
> > #if HAVE_RV
> > @@ -48,6 +52,23 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> > c->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
> > c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
> > }
> > +#if HAVE_RVV
> > + if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> > + c->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_rvv;
> > + c->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_rvv;
> > + c->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_rvv;
> > + c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv;
> > + c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv;
> > + c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv;
> > +
> > + c->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_rvv;
> > + c->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_rvv;
> > + c->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_rvv;
> > + c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv;
> > + c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv;
> > + c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv;
> > + }
> > +#endif
> > #endif
> > }
> >
> > diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> > index 8a0773f964..9bf969d794 100644
> > --- a/libavcodec/riscv/vp8dsp_rvv.S
> > +++ b/libavcodec/riscv/vp8dsp_rvv.S
> > @@ -20,6 +20,18 @@
> >
> > #include "libavutil/riscv/asm.S"
> >
> > +.macro vsetvlstatic8 len
> > +.if \len <= 4
> > + vsetivli zero, \len, e8, mf4, ta, ma
> > +.elseif \len <= 8
> > + vsetivli zero, \len, e8, mf2, ta, ma
> > +.elseif \len <= 16
> > + vsetivli zero, \len, e8, m1, ta, ma
> > +.elseif \len <= 31
> > + vsetivli zero, \len, e8, m2, ta, ma
> > +.endif
> > +.endm
> > +
> > .macro vp8_idct_dc_add
> > vlse32.v v0, (a0), a2
> > lh a5, 0(a1)
> > @@ -71,3 +83,40 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
> >
> > ret
> > endfunc
> > +
> > +.macro bilin_load dst len type mn
> > +.ifc \type,v
> > + add t5, a2, a3
> > +.elseif \type == h
>
> h is not a number so that's not a valid condition.
>
> > + addi t5, a2, 1
> > +.endif
> > + vle8.v \dst, (a2)
> > + vle8.v v2, (t5)
> > + vwmulu.vx v28, \dst, t1
> > + vwmaccu.vx v28, \mn, v2
> > + vwaddu.wx v24, v28, t4
> > + vnsra.wi \dst, v24, 3
> > +.endm
> > +
> > +.macro put_vp8_bilin_h_v len type mn
> > +func ff_put_vp8_bilin\len\()_\type\()_rvv, zve32x
> > + vsetvlstatic8 \len
> > + li t1, 8
> > + li t4, 4
> > + sub t1, t1, \mn
> > +1:
> > + addi a4, a4, -1
> > + bilin_load v0, \len, \type, \mn
> > + vse8.v v0, (a0)
> > + add a2, a2, a3
> > + add a0, a0, a1
> > + bnez a4, 1b
> > +
> > + ret
> > +endfunc
> > +.endm
> > +
> > +.irp len 16,8,4
> > +put_vp8_bilin_h_v \len h a5
> > +put_vp8_bilin_h_v \len v a6
> > +.endr
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v
2024-05-07 16:07 ` flow gg
@ 2024-05-07 16:10 ` Rémi Denis-Courmont
0 siblings, 0 replies; 15+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-07 16:10 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Le tiistaina 7. toukokuuta 2024, 19.07.46 EEST flow gg a écrit :
> I didn't understand what you mean... What does judging whether the type is
> 'h' or 'v' have to do with the number?
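In an assembler `.if`/`.elseif` expression, `==` is only valid for numeric values, not for identifiers such as `h`; comparing strings needs `.ifc`, as the `v` case already does.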
--
レミ・デニ-クールモン
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2024-05-07 16:10 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <20240506033809.3790245-1-uk7b@foxmail.com>
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v uk7b
2024-05-06 3:44 ` flow gg
2024-05-07 15:59 ` Rémi Denis-Courmont
2024-05-07 16:07 ` flow gg
2024-05-07 16:10 ` Rémi Denis-Courmont
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 3/9] lavc/vp8dsp: R-V V put_bilin_hv uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 4/9] lavc/vp8dsp: R-V V put_epel h uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 5/9] lavc/vp8dsp: R-V V put_epel v uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv uk7b
2024-05-06 19:24 ` Rémi Denis-Courmont
2024-05-07 2:27 ` [FFmpeg-devel] [PATCH " uk7b
2024-05-07 2:31 ` [FFmpeg-devel] [PATCH v3 " flow gg
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 7/9] lavc/vp8dsp: R-V V loop_filter_simple uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 8/9] lavc/vp8dsp: R-V V loop_filter_inner uk7b
2024-05-06 3:38 ` [FFmpeg-devel] [PATCH v3 9/9] lavc/vp8dsp: R-V V loop_filter uk7b
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git