* [FFmpeg-devel] [PATCH v2 2/4] lavc/vp8dsp: R-V V loop_filter_simple
[not found] <20240714162824.2728146-1-uk7b@foxmail.com>
@ 2024-07-14 16:28 ` uk7b
2024-07-14 16:33 ` flow gg
2024-07-14 16:28 ` [FFmpeg-devel] [PATCH v2 3/4] lavc/vp8dsp: R-V V loop_filter_inner uk7b
2024-07-14 16:28 ` [FFmpeg-devel] [PATCH v2 4/4] lavc/vp8dsp: R-V V loop_filter uk7b
2 siblings, 1 reply; 4+ messages in thread
From: uk7b @ 2024-07-14 16:28 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
vp8_loop_filter_simple_h_c : 6.2 5.7
vp8_loop_filter_simple_h_rvv_i32 : 3.0 2.5
vp8_loop_filter_simple_v_c : 6.5 6.2
vp8_loop_filter_simple_v_rvv_i32 : 2.0 1.5
---
libavcodec/riscv/vp8dsp_init.c | 18 +++++++-
libavcodec/riscv/vp8dsp_rvv.S | 77 ++++++++++++++++++++++++++++++++++
2 files changed, 94 insertions(+), 1 deletion(-)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index dcb6307d5b..8c5b2c8b04 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -49,6 +49,9 @@ VP8_BILIN(16, rvv256);
VP8_BILIN(8, rvv256);
VP8_BILIN(4, rvv256);
+VP8_LF(rvv128);
+VP8_LF(rvv256);
+
av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
{
#if HAVE_RV
@@ -147,9 +150,15 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
{
#if HAVE_RVV
+ int vlenb = ff_get_rv_vlenb();
+
+#define init_loop_filter(vlen) \
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_rvv##vlen; \
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_rvv##vlen;
+
int flags = av_get_cpu_flags();
- if (flags & AV_CPU_FLAG_RVV_I32 && ff_rv_vlen_least(128)) {
+ if (flags & AV_CPU_FLAG_RVV_I32 && vlenb >= 16) {
#if __riscv_xlen >= 64
if (flags & AV_CPU_FLAG_RVV_I64)
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_rvv;
@@ -159,6 +168,13 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_rvv;
if (flags & AV_CPU_FLAG_RVV_I64)
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
+
+ if (vlenb >= 32) {
+ init_loop_filter(256);
+ } else {
+ init_loop_filter(128);
+ }
}
+#undef init_loop_filter
#endif
}
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 0cbf1672f7..3cec4dd135 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -275,6 +275,83 @@ func ff_vp78_idct_dc_add4uv_rvv, zve64x
ret
endfunc
+.macro filter_fmin len, vlen, a, f1, p0f2, q0f1, p0, q0
+ vsetvlstatic16 \len, \vlen
+ vsext.vf2 \q0f1, \a
+ vmin.vx \p0f2, \q0f1, a6
+ vmin.vx \q0f1, \q0f1, t6
+ vadd.vi \p0f2, \p0f2, 3
+ vadd.vi \q0f1, \q0f1, 4
+ vsra.vi \p0f2, \p0f2, 3
+ vsra.vi \f1, \q0f1, 3
+ vadd.vv \p0f2, \p0f2, \p0
+ vsub.vv \q0f1, \q0, \f1
+ vmax.vx \p0f2, \p0f2, zero
+ vmax.vx \q0f1, \q0f1, zero
+.endm
+
+.macro filter len, vlen, type, normal, inner, dst, stride, fE, fI, thresh
+.ifc \type,v
+ sub t3, \dst, \stride // -1
+ sub t2, t3, \stride // -2
+ add t4, \dst, \stride // 1
+ vle8.v v3, (t2) // p1
+ vle8.v v4, (t3) // p0
+ vle8.v v5, (\dst) // q0
+ vle8.v v6, (t4) // q1
+.else
+ addi t2, \dst, -2
+ addi t3, \dst, -1
+ vlsseg4e8.v v3, (t2), \stride
+.endif
+ vwsubu.vv v10, v3, v6 // p1-q1
+ vwsubu.vv v12, v5, v4 // q0-p0
+
+ vnclip.wi v16, v10, 0 // clip_int8(p1 - q1)
+ vsetvlstatic16 \len, \vlen
+ // vp8_simple_limit(dst + i, stride, flim)
+ li a6, 2
+ vneg.v v22, v10
+ vneg.v v24, v12
+ vmax.vv v22, v22, v10
+ vmax.vv v24, v24, v12
+ vsrl.vi v22, v22, 1
+ vmacc.vx v22, a6, v24
+ vmsleu.vx v0, v22, \fE
+
+ li a7, 3
+ li a6, 124
+ li t6, 123
+ vmul.vx v22, v12, a7 // 3 * (q0 - p0)
+ vzext.vf2 v24, v4 // p0
+ vzext.vf2 v20, v5 // q0
+ vsetvlstatic8 \len, \vlen
+ vwadd.wv v10, v22, v16
+ vnclip.wi v28, v10, 0
+ filter_fmin \len, \vlen, v28, v12, v26, v10, v24, v20
+ vsetvlstatic8 \len, \vlen
+ vnclipu.wi v30, v26, 0
+ vnclipu.wi v31, v10, 0
+.ifc \type,v
+ vse8.v v30, (t3), v0.t
+ vse8.v v31, (\dst), v0.t
+.else
+ vssseg2e8.v v30, (t3), \stride, v0.t
+.endif
+
+.endm
+
+.irp type,v,h
+.irp vlen,256,128
+func ff_vp8_\type\()_loop_filter16_simple_rvv\vlen, zve32x
+ csrwi vxrm, 0
+ vsetvlstatic8 16, \vlen
+ filter 16, \vlen, \type, 0, 0, a0, a1, a2, a3, a4
+ ret
+endfunc
+.endr
+.endr
+
.macro bilin_load_h dst mn
addi t5, a2, 1
vle8.v \dst, (a2)
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 2/4] lavc/vp8dsp: R-V V loop_filter_simple
2024-07-14 16:28 ` [FFmpeg-devel] [PATCH v2 2/4] lavc/vp8dsp: R-V V loop_filter_simple uk7b
@ 2024-07-14 16:33 ` flow gg
0 siblings, 0 replies; 4+ messages in thread
From: flow gg @ 2024-07-14 16:33 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> vssseg2e8
> vlsseg4e8
> vwadd.wv
> I can't find where VXRM is initialised for that.
Updated them and added the csrwi instruction.
<uk7b@foxmail.com> 于2024年7月15日周一 00:30写道:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
> vp8_loop_filter_simple_h_c : 6.2 5.7
> vp8_loop_filter_simple_h_rvv_i32 : 3.0 2.5
> vp8_loop_filter_simple_v_c : 6.5 6.2
> vp8_loop_filter_simple_v_rvv_i32 : 2.0 1.5
> ---
> libavcodec/riscv/vp8dsp_init.c | 18 +++++++-
> libavcodec/riscv/vp8dsp_rvv.S | 77 ++++++++++++++++++++++++++++++++++
> 2 files changed, 94 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> index dcb6307d5b..8c5b2c8b04 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -49,6 +49,9 @@ VP8_BILIN(16, rvv256);
> VP8_BILIN(8, rvv256);
> VP8_BILIN(4, rvv256);
>
> +VP8_LF(rvv128);
> +VP8_LF(rvv256);
> +
> av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> {
> #if HAVE_RV
> @@ -147,9 +150,15 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
> {
> #if HAVE_RVV
> + int vlenb = ff_get_rv_vlenb();
> +
> +#define init_loop_filter(vlen) \
> + c->vp8_v_loop_filter_simple =
> ff_vp8_v_loop_filter16_simple_rvv##vlen; \
> + c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_rvv##vlen;
> +
> int flags = av_get_cpu_flags();
>
> - if (flags & AV_CPU_FLAG_RVV_I32 && ff_rv_vlen_least(128)) {
> + if (flags & AV_CPU_FLAG_RVV_I32 && vlenb >= 16) {
> #if __riscv_xlen >= 64
> if (flags & AV_CPU_FLAG_RVV_I64)
> c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_rvv;
> @@ -159,6 +168,13 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
> c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_rvv;
> if (flags & AV_CPU_FLAG_RVV_I64)
> c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
> +
> + if (vlenb >= 32) {
> + init_loop_filter(256);
> + } else {
> + init_loop_filter(128);
> + }
> }
> +#undef init_loop_filter
> #endif
> }
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 0cbf1672f7..3cec4dd135 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -275,6 +275,83 @@ func ff_vp78_idct_dc_add4uv_rvv, zve64x
> ret
> endfunc
>
> +.macro filter_fmin len, vlen, a, f1, p0f2, q0f1, p0, q0
> + vsetvlstatic16 \len, \vlen
> + vsext.vf2 \q0f1, \a
> + vmin.vx \p0f2, \q0f1, a6
> + vmin.vx \q0f1, \q0f1, t6
> + vadd.vi \p0f2, \p0f2, 3
> + vadd.vi \q0f1, \q0f1, 4
> + vsra.vi \p0f2, \p0f2, 3
> + vsra.vi \f1, \q0f1, 3
> + vadd.vv \p0f2, \p0f2, \p0
> + vsub.vv \q0f1, \q0, \f1
> + vmax.vx \p0f2, \p0f2, zero
> + vmax.vx \q0f1, \q0f1, zero
> +.endm
> +
> +.macro filter len, vlen, type, normal, inner, dst, stride, fE, fI, thresh
> +.ifc \type,v
> + sub t3, \dst, \stride // -1
> + sub t2, t3, \stride // -2
> + add t4, \dst, \stride // 1
> + vle8.v v3, (t2) // p1
> + vle8.v v4, (t3) // p0
> + vle8.v v5, (\dst) // q0
> + vle8.v v6, (t4) // q1
> +.else
> + addi t2, \dst, -2
> + addi t3, \dst, -1
> + vlsseg4e8.v v3, (t2), \stride
> +.endif
> + vwsubu.vv v10, v3, v6 // p1-q1
> + vwsubu.vv v12, v5, v4 // q0-p0
> +
> + vnclip.wi v16, v10, 0 // clip_int8(p1 - q1)
> + vsetvlstatic16 \len, \vlen
> + // vp8_simple_limit(dst + i, stride, flim)
> + li a6, 2
> + vneg.v v22, v10
> + vneg.v v24, v12
> + vmax.vv v22, v22, v10
> + vmax.vv v24, v24, v12
> + vsrl.vi v22, v22, 1
> + vmacc.vx v22, a6, v24
> + vmsleu.vx v0, v22, \fE
> +
> + li a7, 3
> + li a6, 124
> + li t6, 123
> + vmul.vx v22, v12, a7 // 3 * (q0 - p0)
> + vzext.vf2 v24, v4 // p0
> + vzext.vf2 v20, v5 // q0
> + vsetvlstatic8 \len, \vlen
> + vwadd.wv v10, v22, v16
> + vnclip.wi v28, v10, 0
> + filter_fmin \len, \vlen, v28, v12, v26, v10, v24, v20
> + vsetvlstatic8 \len, \vlen
> + vnclipu.wi v30, v26, 0
> + vnclipu.wi v31, v10, 0
> +.ifc \type,v
> + vse8.v v30, (t3), v0.t
> + vse8.v v31, (\dst), v0.t
> +.else
> + vssseg2e8.v v30, (t3), \stride, v0.t
> +.endif
> +
> +.endm
> +
> +.irp type,v,h
> +.irp vlen,256,128
> +func ff_vp8_\type\()_loop_filter16_simple_rvv\vlen, zve32x
> + csrwi vxrm, 0
> + vsetvlstatic8 16, \vlen
> + filter 16, \vlen, \type, 0, 0, a0, a1, a2, a3, a4
> + ret
> +endfunc
> +.endr
> +.endr
> +
> .macro bilin_load_h dst mn
> addi t5, a2, 1
> vle8.v \dst, (a2)
> --
> 2.45.2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 4+ messages in thread
* [FFmpeg-devel] [PATCH v2 3/4] lavc/vp8dsp: R-V V loop_filter_inner
[not found] <20240714162824.2728146-1-uk7b@foxmail.com>
2024-07-14 16:28 ` [FFmpeg-devel] [PATCH v2 2/4] lavc/vp8dsp: R-V V loop_filter_simple uk7b
@ 2024-07-14 16:28 ` uk7b
2024-07-14 16:28 ` [FFmpeg-devel] [PATCH v2 4/4] lavc/vp8dsp: R-V V loop_filter uk7b
2 siblings, 0 replies; 4+ messages in thread
From: uk7b @ 2024-07-14 16:28 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
vp8_loop_filter8uv_inner_h_c : 11.0 9.5
vp8_loop_filter8uv_inner_h_rvv_i32 : 10.5 8.7
vp8_loop_filter8uv_inner_v_c : 11.2 11.0
vp8_loop_filter8uv_inner_v_rvv_i32 : 7.7 6.2
vp8_loop_filter16y_inner_h_c : 11.2 9.0
vp8_loop_filter16y_inner_h_rvv_i32 : 8.0 6.2
vp8_loop_filter16y_inner_v_c : 11.5 10.5
vp8_loop_filter16y_inner_v_rvv_i32 : 5.2 3.7
---
libavcodec/riscv/vp8dsp_init.c | 6 ++-
libavcodec/riscv/vp8dsp_rvv.S | 90 ++++++++++++++++++++++++++++++++++
2 files changed, 95 insertions(+), 1 deletion(-)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 8c5b2c8b04..8cb21b8ceb 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -154,7 +154,11 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
#define init_loop_filter(vlen) \
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_rvv##vlen; \
- c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_rvv##vlen;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_rvv##vlen; \
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_rvv##vlen; \
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_rvv##vlen; \
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_rvv##vlen; \
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_rvv##vlen;
int flags = av_get_cpu_flags();
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 3cec4dd135..036872a29e 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -275,6 +275,13 @@ func ff_vp78_idct_dc_add4uv_rvv, zve64x
ret
endfunc
+.macro filter_abs dst, mask, tmp, diff, fI
+ vneg.v \tmp, \diff
+ vmax.vv \dst, \tmp, \diff
+ vmsleu.vx \tmp, \dst, \fI
+ vmand.mm \mask, \mask, \tmp
+.endm
+
.macro filter_fmin len, vlen, a, f1, p0f2, q0f1, p0, q0
vsetvlstatic16 \len, \vlen
vsext.vf2 \q0f1, \a
@@ -299,13 +306,36 @@ endfunc
vle8.v v4, (t3) // p0
vle8.v v5, (\dst) // q0
vle8.v v6, (t4) // q1
+ .if \normal
+ sub t1, t2, \stride // -3
+ sub t0, t1, \stride // -4
+ add t5, t4, \stride // 2
+ add t6, t5, \stride // 3
+ vle8.v v1, (t0) // p3
+ vle8.v v2, (t1) // p2
+ vle8.v v7, (t5) // q2
+ vle8.v v8, (t6) // q3
+ .endif
.else
addi t2, \dst, -2
addi t3, \dst, -1
+ .if \normal
+ addi t1, \dst, -4
+ vlsseg8e8.v v1, (t1), \stride
+ .else
vlsseg4e8.v v3, (t2), \stride
+ .endif
.endif
vwsubu.vv v10, v3, v6 // p1-q1
vwsubu.vv v12, v5, v4 // q0-p0
+.if \normal
+ vwsubu.vv v30, v1, v2 // p3-p2
+ vwsubu.vv v28, v2, v3 // p2-p1
+ vwsubu.vv v26, v3, v4 // p1-p0
+ vwsubu.vv v24, v8, v7 // q3-q2
+ vwsubu.vv v22, v7, v6 // q2-q1
+ vwsubu.vv v8, v6, v5 // q1-q0
+.endif
vnclip.wi v16, v10, 0 // clip_int8(p1 - q1)
vsetvlstatic16 \len, \vlen
@@ -319,6 +349,24 @@ endfunc
vmacc.vx v22, a6, v24
vmsleu.vx v0, v22, \fE
+.if \normal
+ vneg.v v22, v26
+ vmax.vv v26, v22, v26
+ vmsleu.vx v1, v26, \fI
+ filter_abs v22, v1, v10, v28, \fI
+ filter_abs v22, v1, v10, v30, \fI
+ filter_abs v22, v1, v10, v24, \fI
+ filter_abs v22, v1, v10, v22, \fI
+ filter_abs v20, v1, v10, v8, \fI
+ vzext.vf2 v8, v3 // p1
+ vmand.mm v1, v0, v1 // vp8_simple_limit && normal
+ vmsgtu.vx v26, v26, \thresh // hev: FFABS(p1 - p0) > thresh
+ vmsgtu.vx v3, v20, \thresh // hev: FFABS(q1 - q0) > thresh
+ vzext.vf2 v14, v6 // q1
+ vmor.mm v3, v3, v26 // FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh
+ vmand.mm v0, v1, v3 // v0 = normal && hev
+ vmnot.m v3, v3 // v3 = !hv
+.endif
li a7, 3
li a6, 124
li t6, 123
@@ -339,6 +387,33 @@ endfunc
vssseg2e8.v v30, (t3), \stride, v0.t
.endif
+.if \normal
+ vmand.mm v0, v1, v3 // vp8_normal_limit & !hv
+
+ vnclip.wi v22, v22, 0 // clip_int8(a);
+ filter_fmin \len, \vlen, v22, v12, v26, v10, v24, v20
+ vadd.vi v12, v12, 1
+ vsra.vi v12, v12, 1 // (f1 + 1) >> 1;
+ vadd.vv v8, v8, v12 // p1 + a
+ vsub.vv v14, v14, v12 // q1 - a
+
+ vmax.vx v8, v8, zero
+ vmax.vx v14, v14, zero
+ vsetvlstatic8 \len, \vlen
+ vnclipu.wi v3, v8, 0 // -2
+ vnclipu.wi v4, v26, 0 // -1
+ vnclipu.wi v5, v10, 0 // 0
+ vnclipu.wi v6, v14, 0 // 1
+
+ .ifc \type,v
+ vse8.v v3, (t2), v0.t // -2
+ vse8.v v4, (t3), v0.t // -1
+ vse8.v v5, (\dst), v0.t // 0
+ vse8.v v6, (t4), v0.t // 1
+ .else
+ vssseg4e8.v v3, (t2), \stride, v0.t
+ .endif
+.endif
.endm
.irp type,v,h
@@ -349,6 +424,21 @@ func ff_vp8_\type\()_loop_filter16_simple_rvv\vlen, zve32x
filter 16, \vlen, \type, 0, 0, a0, a1, a2, a3, a4
ret
endfunc
+
+func ff_vp8_\type\()_loop_filter16_inner_rvv\vlen, zve32x
+ csrwi vxrm, 0
+ vsetvlstatic8 16, \vlen
+ filter 16, \vlen, \type, 1, 1, a0, a1, a2, a3, a4
+ ret
+endfunc
+
+func ff_vp8_\type\()_loop_filter8uv_inner_rvv\vlen, zve32x
+ csrwi vxrm, 0
+ vsetvlstatic8 8, \vlen
+ filter 8, \vlen, \type, 1, 1, a0, a2, a3, a4, a5
+ filter 8, \vlen, \type, 1, 1, a1, a2, a3, a4, a5
+ ret
+endfunc
.endr
.endr
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 4+ messages in thread
* [FFmpeg-devel] [PATCH v2 4/4] lavc/vp8dsp: R-V V loop_filter
[not found] <20240714162824.2728146-1-uk7b@foxmail.com>
2024-07-14 16:28 ` [FFmpeg-devel] [PATCH v2 2/4] lavc/vp8dsp: R-V V loop_filter_simple uk7b
2024-07-14 16:28 ` [FFmpeg-devel] [PATCH v2 3/4] lavc/vp8dsp: R-V V loop_filter_inner uk7b
@ 2024-07-14 16:28 ` uk7b
2 siblings, 0 replies; 4+ messages in thread
From: uk7b @ 2024-07-14 16:28 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
vp8_loop_filter8uv_h_c : 12.2 10.0
vp8_loop_filter8uv_h_rvv_i32 : 11.5 9.7
vp8_loop_filter8uv_v_c : 13.2 11.2
vp8_loop_filter8uv_v_rvv_i32 : 8.0 6.5
vp8_loop_filter16y_h_c : 11.7 10.5
vp8_loop_filter16y_h_rvv_i32 : 9.2 7.2
vp8_loop_filter16y_v_c : 11.5 10.5
vp8_loop_filter16y_v_rvv_i32 : 5.5 3.7
---
libavcodec/riscv/vp8dsp_init.c | 6 +++-
libavcodec/riscv/vp8dsp_rvv.S | 59 ++++++++++++++++++++++++++++++++++
2 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 8cb21b8ceb..c53223a0e8 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -158,7 +158,11 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_rvv##vlen; \
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_rvv##vlen; \
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_rvv##vlen; \
- c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_rvv##vlen;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_rvv##vlen; \
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_rvv##vlen; \
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_rvv##vlen; \
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_rvv##vlen; \
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_rvv##vlen;
int flags = av_get_cpu_flags();
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 036872a29e..298910ea90 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -390,12 +390,43 @@ endfunc
.if \normal
vmand.mm v0, v1, v3 // vp8_normal_limit & !hv
+ .if \inner
vnclip.wi v22, v22, 0 // clip_int8(a);
filter_fmin \len, \vlen, v22, v12, v26, v10, v24, v20
vadd.vi v12, v12, 1
vsra.vi v12, v12, 1 // (f1 + 1) >> 1;
vadd.vv v8, v8, v12 // p1 + a
vsub.vv v14, v14, v12 // q1 - a
+ .else
+ li t6, 27
+ li a7, 18
+ li a6, 9
+ vwmul.vx v22, v28, t6
+ vwmul.vx v4, v28, a7
+ vwmul.vx v26, v28, a6
+ vsetvlstatic16 \len, \vlen
+ li a7, 63
+ vzext.vf2 v18, v2 // p2
+ vzext.vf2 v28, v7 // q2
+ vadd.vx v22, v22, a7
+ vadd.vx v4, v4, a7
+ vadd.vx v26, v26, a7
+ vsra.vi v22, v22, 7 // a0
+ vsra.vi v4, v4, 7 // a1
+ vsra.vi v26, v26, 7 // a2
+ vadd.vv v18, v18, v26
+ vadd.vv v8, v8, v4
+ vadd.vv v30, v24, v22
+ vsub.vv v28, v28, v26
+ vsub.vv v10, v20, v22
+ vsub.vv v14, v14, v4
+ vmax.vx v18, v18, zero
+ vmax.vx v8, v8, zero
+ vmax.vx v26, v30, zero
+ vmax.vx v10, v10, zero
+ vmax.vx v14, v14, zero
+ vmax.vx v28, v28, zero
+ .endif
vmax.vx v8, v8, zero
vmax.vx v14, v14, zero
@@ -404,14 +435,27 @@ endfunc
vnclipu.wi v4, v26, 0 // -1
vnclipu.wi v5, v10, 0 // 0
vnclipu.wi v6, v14, 0 // 1
+ .if !\inner
+ vnclipu.wi v2, v18, 0
+ vnclipu.wi v7, v28, 0
+ addi t0, \dst, -3
+ .endif
.ifc \type,v
vse8.v v3, (t2), v0.t // -2
vse8.v v4, (t3), v0.t // -1
vse8.v v5, (\dst), v0.t // 0
vse8.v v6, (t4), v0.t // 1
+ .if !\inner
+ vse8.v v2, (t1), v0.t
+ vse8.v v7, (t5), v0.t
+ .endif
.else
+ .if \inner
vssseg4e8.v v3, (t2), \stride, v0.t
+ .else
+ vssseg6e8.v v2, (t0), \stride, v0.t
+ .endif
.endif
.endif
.endm
@@ -439,6 +483,21 @@ func ff_vp8_\type\()_loop_filter8uv_inner_rvv\vlen, zve32x
filter 8, \vlen, \type, 1, 1, a1, a2, a3, a4, a5
ret
endfunc
+
+func ff_vp8_\type\()_loop_filter16_rvv\vlen, zve32x
+ csrwi vxrm, 0
+ vsetvlstatic8 16, \vlen
+ filter 16, \vlen, \type, 1, 0, a0, a1, a2, a3, a4
+ ret
+endfunc
+
+func ff_vp8_\type\()_loop_filter8uv_rvv\vlen, zve32x
+ csrwi vxrm, 0
+ vsetvlstatic8 8, \vlen
+ filter 8, \vlen, \type, 1, 0, a0, a2, a3, a4, a5
+ filter 8, \vlen, \type, 1, 0, a1, a2, a3, a4, a5
+ ret
+endfunc
.endr
.endr
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 4+ messages in thread