Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 02/10] lavc/vp8dsp: R-V V put_bilin_h
       [not found] <20240505164536.872683-1-uk7b@foxmail.com>
@ 2024-05-05 16:45 ` uk7b
  2024-05-05 19:06   ` Rémi Denis-Courmont
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 03/10] lavc/vp8dsp: R-V V put_bilin_v uk7b
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 13+ messages in thread
From: uk7b @ 2024-05-05 16:45 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_put_bilin4_h_c: 373.5
vp8_put_bilin4_h_rvv_i32: 158.7
vp8_put_bilin8_h_c: 1437.7
vp8_put_bilin8_h_rvv_i32: 318.7
vp8_put_bilin16_h_c: 2845.7
vp8_put_bilin16_h_rvv_i32: 374.7
---
 libavcodec/riscv/vp8dsp_init.c | 14 +++++++++++
 libavcodec/riscv/vp8dsp_rvv.S  | 45 ++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index fa3feeacf7..778d5ceb29 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -34,6 +34,10 @@ VP8_EPEL(16, rvi);
 VP8_EPEL(8,  rvi);
 VP8_EPEL(4,  rvi);
 
+VP8_BILIN(16, rvv);
+VP8_BILIN(8,  rvv);
+VP8_BILIN(4,  rvv);
+
 av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
 {
 #if HAVE_RV
@@ -48,6 +52,16 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
         c->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
         c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
     }
+#if HAVE_RVV
+    if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
+        c->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_rvv;
+        c->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_rvv;
+        c->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_rvv;
+        c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv;
+        c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv;
+        c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv;
+    }
+#endif
 #endif
 }
 
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 8a0773f964..760d9d3871 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -20,6 +20,18 @@
 
 #include "libavutil/riscv/asm.S"
 
+.macro vsetvlstatic8 len
+.if \len <= 4
+        vsetivli        zero, \len, e8, mf4, ta, ma
+.elseif \len <= 8
+        vsetivli        zero, \len, e8, mf2, ta, ma
+.elseif \len <= 16
+        vsetivli        zero, \len, e8, m1, ta, ma
+.elseif \len <= 31
+        vsetivli        zero, \len, e8, m2, ta, ma
+.endif
+.endm
+
 .macro vp8_idct_dc_add
         vlse32.v      v0, (a0), a2
         lh            a5, 0(a1)
@@ -71,3 +83,36 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
 
         ret
 endfunc
+
+.macro bilin_h_load dst len
+        vsetvlstatic8   \len + 1
+        vle8.v          \dst, (a2)
+        vslide1down.vx  v2, \dst, t5
+        vsetvlstatic8   \len
+        vwmulu.vx       v28, \dst, t1
+        vwmaccu.vx      v28, a5, v2
+        vwaddu.wx       v24, v28, t4
+        vnsra.wi        \dst, v24, 3
+.endm
+
+.macro put_vp8_bilin_h len
+func ff_put_vp8_bilin\len\()_h_rvv, zve32x
+        li              t1, 8
+        li              t4, 4
+        li              t5, 1
+        sub             t1, t1, a5
+1:
+        addi            a4, a4, -1
+        bilin_h_load    v0, \len
+        vse8.v          v0, (a0)
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
+.irp len 16,8,4
+put_vp8_bilin_h \len
+.endr
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH 03/10] lavc/vp8dsp: R-V V put_bilin_v
       [not found] <20240505164536.872683-1-uk7b@foxmail.com>
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 02/10] lavc/vp8dsp: R-V V put_bilin_h uk7b
@ 2024-05-05 16:45 ` uk7b
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 04/10] lavc/vp8dsp: R-V V put_bilin_hv uk7b
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: uk7b @ 2024-05-05 16:45 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_put_bilin4_v_c: 383.5
vp8_put_bilin4_v_rvv_i32: 139.7
vp8_put_bilin8_v_c: 1455.7
vp8_put_bilin8_v_rvv_i32: 299.7
vp8_put_bilin16_v_c: 2863.7
vp8_put_bilin16_v_rvv_i32: 347.7
---
 libavcodec/riscv/vp8dsp_init.c |  7 +++++++
 libavcodec/riscv/vp8dsp_rvv.S  | 25 +++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 778d5ceb29..afffa6de2f 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -60,6 +60,13 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
         c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv;
         c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv;
         c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv;
+
+        c->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_rvv;
+        c->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_rvv;
+        c->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_rvv;
+        c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv;
+        c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv;
+        c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv;
     }
 #endif
 #endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 760d9d3871..2a2d40d77d 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -113,6 +113,31 @@ func ff_put_vp8_bilin\len\()_h_rvv, zve32x
 endfunc
 .endm
 
+.macro put_vp8_bilin_v len
+func ff_put_vp8_bilin\len\()_v_rvv, zve32x
+        vsetvlstatic8   \len
+        li              t1, 8
+        li              t4, 4
+        sub             t1, t1, a6
+1:
+        add             t2, a2, a3
+        addi            a4, a4, -1
+        vle8.v          v0, (a2)
+        vle8.v          v2, (t2)
+        vwmulu.vx       v28, v0, t1
+        vwmaccu.vx      v28, a6, v2
+        vwaddu.wx       v24, v28, t4
+        vnsra.wi        v0, v24, 3
+        vse8.v          v0, (a0)
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len 16,8,4
 put_vp8_bilin_h \len
+put_vp8_bilin_v \len
 .endr
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH 04/10] lavc/vp8dsp: R-V V put_bilin_hv
       [not found] <20240505164536.872683-1-uk7b@foxmail.com>
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 02/10] lavc/vp8dsp: R-V V put_bilin_h uk7b
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 03/10] lavc/vp8dsp: R-V V put_bilin_v uk7b
@ 2024-05-05 16:45 ` uk7b
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 05/10] lavc/vp8dsp: R-V V put_epel h uk7b
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: uk7b @ 2024-05-05 16:45 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_put_bilin4_hv_c: 567.7
vp8_put_bilin4_hv_rvv_i32: 255.7
vp8_put_bilin8_hv_c: 2169.5
vp8_put_bilin8_hv_rvv_i32: 528.7
vp8_put_bilin16_hv_c: 4777.5
vp8_put_bilin16_hv_rvv_i32: 587.7
---
 libavcodec/riscv/vp8dsp_init.c | 13 +++++++++++++
 libavcodec/riscv/vp8dsp_rvv.S  | 26 ++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index afffa6de2f..9627105fc8 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -67,6 +67,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
         c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv;
         c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv;
         c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv;
+
+        c->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_rvv;
+        c->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_rvv;
+        c->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_rvv;
+        c->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_rvv;
+        c->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_rvv;
+        c->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_rvv;
+        c->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_rvv;
+        c->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_rvv;
+        c->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_rvv;
+        c->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_rvv;
+        c->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_rvv;
+        c->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_rvv;
     }
 #endif
 #endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 2a2d40d77d..f8105010c9 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -137,7 +137,33 @@ func ff_put_vp8_bilin\len\()_v_rvv, zve32x
 endfunc
 .endm
 
+.macro put_vp8_bilin_hv len
+func ff_put_vp8_bilin\len\()_hv_rvv, zve32x
+        li              t3, 8
+        sub             t1, t3, a5
+        sub             t2, t3, a6
+        li              t4, 4
+        li              t5, 1
+        bilin_h_load    v4, \len
+        add             a2, a2, a3
+1:
+        addi            a4, a4, -1
+        vwmulu.vx       v20, v4, t2
+        bilin_h_load    v4, \len
+        vwmaccu.vx      v20, a6, v4
+        vwaddu.wx       v24, v20, t4
+        vnsra.wi        v0, v24, 3
+        vse8.v          v0, (a0)
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len 16,8,4
 put_vp8_bilin_h \len
 put_vp8_bilin_v \len
+put_vp8_bilin_hv \len
 .endr
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH 05/10] lavc/vp8dsp: R-V V put_epel h
       [not found] <20240505164536.872683-1-uk7b@foxmail.com>
                   ` (2 preceding siblings ...)
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 04/10] lavc/vp8dsp: R-V V put_bilin_hv uk7b
@ 2024-05-05 16:45 ` uk7b
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 06/10] lavc/vp8dsp: R-V V put_epel v uk7b
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: uk7b @ 2024-05-05 16:45 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_put_epel4_h4_c: 10.7
vp8_put_epel4_h4_rvv_i32: 5.0
vp8_put_epel4_h6_c: 15.0
vp8_put_epel4_h6_rvv_i32: 6.2
vp8_put_epel8_h4_c: 43.2
vp8_put_epel8_h4_rvv_i32: 11.2
vp8_put_epel8_h6_c: 57.5
vp8_put_epel8_h6_rvv_i32: 13.5
vp8_put_epel16_h4_c: 92.5
vp8_put_epel16_h4_rvv_i32: 13.7
vp8_put_epel16_h6_c: 139.0
vp8_put_epel16_h6_rvv_i32: 16.5
---
 libavcodec/riscv/vp8dsp_init.c | 10 ++++
 libavcodec/riscv/vp8dsp_rvv.S  | 87 ++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 9627105fc8..a4b7d49932 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -33,6 +33,9 @@ void ff_vp8_idct_dc_add4uv_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t str
 VP8_EPEL(16, rvi);
 VP8_EPEL(8,  rvi);
 VP8_EPEL(4,  rvi);
+VP8_EPEL(16, rvv);
+VP8_EPEL(8,  rvv);
+VP8_EPEL(4,  rvv);
 
 VP8_BILIN(16, rvv);
 VP8_BILIN(8,  rvv);
@@ -80,6 +83,13 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
         c->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_rvv;
         c->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_rvv;
         c->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_rvv;
+
+        c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_rvv;
+        c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_rvv;
+        c->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_rvv;
+        c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
+        c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
+        c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;
     }
 #endif
 #endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index f8105010c9..f5c4c1d85d 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -32,6 +32,16 @@
 .endif
 .endm
 
+.macro vsetvlstatic16 len
+.if \len <= 4
+        vsetivli        zero, \len, e16, mf2, ta, ma
+.elseif \len <= 8
+        vsetivli        zero, \len, e16, m1, ta, ma
+.elseif \len <= 16
+        vsetivli        zero, \len, e16, m2, ta, ma
+.endif
+.endm
+
 .macro vp8_idct_dc_add
         vlse32.v      v0, (a0), a2
         lh            a5, 0(a1)
@@ -162,8 +172,85 @@ func ff_put_vp8_bilin\len\()_hv_rvv, zve32x
 endfunc
 .endm
 
+const subpel_filters
+        .byte 0,  -6, 123,  12,  -1, 0
+        .byte 2, -11, 108,  36,  -8, 1
+        .byte 0,  -9,  93,  50,  -6, 0
+        .byte 3, -16,  77,  77, -16, 3
+        .byte 0,  -6,  50,  93,  -9, 0
+        .byte 1,  -8,  36, 108, -11, 2
+        .byte 0,  -1,  12, 123,  -6, 0
+endconst
+
+.macro epel_filter size
+        lla             t2, subpel_filters
+        addi            t0, a5, -1
+        li              t1, 6
+        mul             t0, t0, t1
+        add             t0, t0, t2
+        .irp n 1,2,3,4
+        lb              t\n, \n(t0)
+        .endr
+.ifc \size,6
+        lb              t5, 5(t0)
+        lb              t0, (t0)
+.endif
+.endm
+
+.macro epel_load dst len size
+        addi            t6, a2, -1
+        addi            a7, a2, 1
+        vle8.v          v24, (a2)
+        vle8.v          v22, (t6)
+        vle8.v          v26, (a7)
+        addi            a7, a7, 1
+        vle8.v          v28, (a7)
+        vwmulu.vx       v16, v24, t2
+        vwmulu.vx       v20, v26, t3
+.ifc \size,6
+        addi            t6, t6, -1
+        addi            a7, a7, 1
+        vle8.v          v24, (t6)
+        vle8.v          v26, (a7)
+        vwmaccu.vx      v16, t0, v24
+        vwmaccu.vx      v16, t5, v26
+.endif
+        li              t6, 64
+        vwmaccsu.vx     v16, t1, v22
+        vwmaccsu.vx     v16, t4, v28
+        vwadd.wx        v16, v16, t6
+        vsetvlstatic16  \len
+        vwadd.vv        v24, v16, v20
+        vnsra.wi        v24, v24, 7
+        vmax.vx         v24, v24, zero
+        vsetvlstatic8   \len
+        vnclipu.wi      \dst, v24, 0
+.endm
+
+.macro epel_load_inc dst len size
+        epel_load       \dst \len \size
+        add             a2, a2, a3
+.endm
+
+.macro epel len size type
+func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
+        epel_filter     \size
+        vsetvlstatic8   \len
+1:
+        addi            a4, a4, -1
+        epel_load_inc   v30 \len \size
+        vse8.v          v30, (a0)
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len 16,8,4
 put_vp8_bilin_h \len
 put_vp8_bilin_v \len
 put_vp8_bilin_hv \len
+epel \len 6 h
+epel \len 4 h
 .endr
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH 06/10] lavc/vp8dsp: R-V V put_epel v
       [not found] <20240505164536.872683-1-uk7b@foxmail.com>
                   ` (3 preceding siblings ...)
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 05/10] lavc/vp8dsp: R-V V put_epel h uk7b
@ 2024-05-05 16:45 ` uk7b
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 07/10] lavc/vp8dsp: R-V V put_epel hv uk7b
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: uk7b @ 2024-05-05 16:45 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_put_epel4_v4_c: 11.0
vp8_put_epel4_v4_rvv_i32: 5.0
vp8_put_epel4_v6_c: 16.5
vp8_put_epel4_v6_rvv_i32: 6.2
vp8_put_epel8_v4_c: 43.7
vp8_put_epel8_v4_rvv_i32: 11.2
vp8_put_epel8_v6_c: 68.7
vp8_put_epel8_v6_rvv_i32: 13.2
vp8_put_epel16_v4_c: 92.5
vp8_put_epel16_v4_rvv_i32: 13.7
vp8_put_epel16_v6_c: 135.7
vp8_put_epel16_v6_rvv_i32: 16.5
---
 libavcodec/riscv/vp8dsp_init.c |  7 +++++++
 libavcodec/riscv/vp8dsp_rvv.S  | 34 +++++++++++++++++++++++-----------
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index a4b7d49932..dc3e087f01 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -90,6 +90,13 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
         c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
         c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
         c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;
+
+        c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_rvv;
+        c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_rvv;
+        c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_rvv;
+        c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
+        c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
+        c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
     }
 #endif
 #endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index f5c4c1d85d..ca5581f845 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -182,9 +182,13 @@ const subpel_filters
         .byte 0,  -1,  12, 123,  -6, 0
 endconst
 
-.macro epel_filter size
+.macro epel_filter size type
         lla             t2, subpel_filters
+.ifc \type,v
+        addi            t0, a6, -1
+.elseif \type == h
         addi            t0, a5, -1
+.endif
         li              t1, 6
         mul             t0, t0, t1
         add             t0, t0, t2
@@ -197,19 +201,25 @@ endconst
 .endif
 .endm
 
-.macro epel_load dst len size
-        addi            t6, a2, -1
-        addi            a7, a2, 1
+.macro epel_load dst len size type
+.ifc \type,v
+        mv              a5, a3
+.else
+        li              a5, 1
+.endif
+        sub             t6, a2, a5
+        add             a7, a2, a5
+
         vle8.v          v24, (a2)
         vle8.v          v22, (t6)
         vle8.v          v26, (a7)
-        addi            a7, a7, 1
+        add             a7, a7, a5
         vle8.v          v28, (a7)
         vwmulu.vx       v16, v24, t2
         vwmulu.vx       v20, v26, t3
 .ifc \size,6
-        addi            t6, t6, -1
-        addi            a7, a7, 1
+        sub             t6, t6, a5
+        add             a7, a7, a5
         vle8.v          v24, (t6)
         vle8.v          v26, (a7)
         vwmaccu.vx      v16, t0, v24
@@ -227,18 +237,18 @@ endconst
         vnclipu.wi      \dst, v24, 0
 .endm
 
-.macro epel_load_inc dst len size
-        epel_load       \dst \len \size
+.macro epel_load_inc dst len size type
+        epel_load       \dst \len \size \type
         add             a2, a2, a3
 .endm
 
 .macro epel len size type
 func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
-        epel_filter     \size
+        epel_filter     \size \type
         vsetvlstatic8   \len
 1:
         addi            a4, a4, -1
-        epel_load_inc   v30 \len \size
+        epel_load_inc   v30 \len \size \type
         vse8.v          v30, (a0)
         add             a0, a0, a1
         bnez            a4, 1b
@@ -253,4 +263,6 @@ put_vp8_bilin_v \len
 put_vp8_bilin_hv \len
 epel \len 6 h
 epel \len 4 h
+epel \len 6 v
+epel \len 4 v
 .endr
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH 07/10] lavc/vp8dsp: R-V V put_epel hv
       [not found] <20240505164536.872683-1-uk7b@foxmail.com>
                   ` (4 preceding siblings ...)
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 06/10] lavc/vp8dsp: R-V V put_epel v uk7b
@ 2024-05-05 16:45 ` uk7b
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 08/10] lavc/vp8dsp: R-V V loop_filter_simple uk7b
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: uk7b @ 2024-05-05 16:45 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_put_epel4_h4v4_c: 20.0
vp8_put_epel4_h4v4_rvv_i32: 11.0
vp8_put_epel4_h4v6_c: 25.2
vp8_put_epel4_h4v6_rvv_i32: 13.5
vp8_put_epel4_h6v4_c: 22.2
vp8_put_epel4_h6v4_rvv_i32: 14.5
vp8_put_epel4_h6v6_c: 29.0
vp8_put_epel4_h6v6_rvv_i32: 15.7
vp8_put_epel8_h4v4_c: 73.0
vp8_put_epel8_h4v4_rvv_i32: 22.2
vp8_put_epel8_h4v6_c: 90.5
vp8_put_epel8_h4v6_rvv_i32: 26.7
vp8_put_epel8_h6v4_c: 85.0
vp8_put_epel8_h6v4_rvv_i32: 27.2
vp8_put_epel8_h6v6_c: 104.7
vp8_put_epel8_h6v6_rvv_i32: 29.5
vp8_put_epel16_h4v4_c: 145.5
vp8_put_epel16_h4v4_rvv_i32: 26.5
vp8_put_epel16_h4v6_c: 190.7
vp8_put_epel16_h4v6_rvv_i32: 47.5
vp8_put_epel16_h6v4_c: 173.7
vp8_put_epel16_h6v4_rvv_i32: 33.2
vp8_put_epel16_h6v6_c: 222.2
vp8_put_epel16_h6v6_rvv_i32: 35.5
---
 libavcodec/riscv/vp8dsp_init.c |  14 ++++
 libavcodec/riscv/vp8dsp_rvv.S  | 118 +++++++++++++++++++++++++++------
 2 files changed, 111 insertions(+), 21 deletions(-)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index dc3e087f01..6ebb2e11e0 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -1,3 +1,4 @@
+
 /*
  * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS).
  *
@@ -97,6 +98,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
         c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
         c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
         c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
+
+        c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
+        c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
+        c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
+        c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
+        c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
+        c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
+        c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
+        c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
+        c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
+        c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
+        c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
+        c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
     }
 #endif
 #endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index ca5581f845..2d5e2260b7 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -1,3 +1,4 @@
+
 /*
  * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS).
  *
@@ -182,26 +183,26 @@ const subpel_filters
         .byte 0,  -1,  12, 123,  -6, 0
 endconst
 
-.macro epel_filter size type
-        lla             t2, subpel_filters
+.macro epel_filter size type regtype
+        lla             \regtype\()2, subpel_filters
 .ifc \type,v
-        addi            t0, a6, -1
+        addi            \regtype\()0, a6, -1
 .elseif \type == h
-        addi            t0, a5, -1
+        addi            \regtype\()0, a5, -1
 .endif
-        li              t1, 6
-        mul             t0, t0, t1
-        add             t0, t0, t2
+        li              \regtype\()1, 6
+        mul             \regtype\()0, \regtype\()0, \regtype\()1
+        add             \regtype\()0, \regtype\()0, \regtype\()2
         .irp n 1,2,3,4
-        lb              t\n, \n(t0)
+        lb              \regtype\n, \n(\regtype\()0)
         .endr
 .ifc \size,6
-        lb              t5, 5(t0)
-        lb              t0, (t0)
+        lb              \regtype\()5, 5(\regtype\()0)
+        lb              \regtype\()0, (\regtype\()0)
 .endif
 .endm
 
-.macro epel_load dst len size type
+.macro epel_load dst len size type from_mem regtype
 .ifc \type,v
         mv              a5, a3
 .else
@@ -210,24 +211,35 @@ endconst
         sub             t6, a2, a5
         add             a7, a2, a5
 
+.if \from_mem
         vle8.v          v24, (a2)
         vle8.v          v22, (t6)
         vle8.v          v26, (a7)
         add             a7, a7, a5
         vle8.v          v28, (a7)
-        vwmulu.vx       v16, v24, t2
-        vwmulu.vx       v20, v26, t3
+        vwmulu.vx       v16, v24, \regtype\()2
+        vwmulu.vx       v20, v26, \regtype\()3
 .ifc \size,6
         sub             t6, t6, a5
         add             a7, a7, a5
         vle8.v          v24, (t6)
         vle8.v          v26, (a7)
-        vwmaccu.vx      v16, t0, v24
-        vwmaccu.vx      v16, t5, v26
+        vwmaccu.vx      v16, \regtype\()0, v24
+        vwmaccu.vx      v16, \regtype\()5, v26
+.endif
+        vwmaccsu.vx     v16, \regtype\()1, v22
+        vwmaccsu.vx     v16, \regtype\()4, v28
+.else
+        vwmulu.vx       v16, v4, \regtype\()2
+        vwmulu.vx       v20, v6, \regtype\()3
+        .ifc \size,6
+        vwmaccu.vx      v16, \regtype\()0, v0
+        vwmaccu.vx      v16, \regtype\()5, v10
+        .endif
+        vwmaccsu.vx     v16, \regtype\()1, v2
+        vwmaccsu.vx     v16, \regtype\()4, v8
 .endif
         li              t6, 64
-        vwmaccsu.vx     v16, t1, v22
-        vwmaccsu.vx     v16, t4, v28
         vwadd.wx        v16, v16, t6
         vsetvlstatic16  \len
         vwadd.vv        v24, v16, v20
@@ -237,21 +249,81 @@ endconst
         vnclipu.wi      \dst, v24, 0
 .endm
 
-.macro epel_load_inc dst len size type
-        epel_load       \dst \len \size \type
+.macro epel_load_inc dst len size type from_mem regtype
+        epel_load       \dst \len \size \type \from_mem \regtype
         add             a2, a2, a3
 .endm
 
 .macro epel len size type
 func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
-        epel_filter     \size \type
+        epel_filter     \size \type t
+        vsetvlstatic8   \len
+1:
+        addi            a4, a4, -1
+        epel_load_inc   v30 \len \size \type 1 t
+        vse8.v          v30, (a0)
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
+.macro epel_hv len hsize vsize
+func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
+        addi            sp, sp, -48
+        .irp n 0,1,2,3,4,5
+#if __riscv_xlen >= 64
+        sd              s\n, \n\()<<3(sp)
+#else
+        sw              s\n, \n\()<<3(sp)
+#endif
+        .endr
+        sub             a2, a2, a3
+        epel_filter     \hsize h t
+        epel_filter     \vsize v s
         vsetvlstatic8   \len
+.if \hsize == 6 || \vsize == 6
+        sub             a2, a2, a3
+        epel_load_inc   v0 \len \hsize h 1 t
+.endif
+        epel_load_inc   v2 \len \hsize h 1 t
+        epel_load_inc   v4 \len \hsize h 1 t
+        epel_load_inc   v6 \len \hsize h 1 t
+        epel_load_inc   v8 \len \hsize h 1 t
+.if \hsize == 6 || \vsize == 6
+        epel_load_inc   v10 \len \hsize h 1 t
+.endif
+        addi            a4, a4, -1
 1:
         addi            a4, a4, -1
-        epel_load_inc   v30 \len \size \type
+        epel_load       v30 \len \vsize v 0 s
         vse8.v          v30, (a0)
+.if \hsize == 6 || \vsize == 6
+        vmv.v.v         v0, v2
+.endif
+        vmv.v.v         v2, v4
+        vmv.v.v         v4, v6
+        vmv.v.v         v6, v8
+.if \hsize == 6 || \vsize == 6
+        vmv.v.v         v8, v10
+        epel_load_inc   v10 \len \hsize h 1 t
+.else
+        epel_load_inc   v8 \len 4 h 1 t
+.endif
         add             a0, a0, a1
         bnez            a4, 1b
+        epel_load       v30 \len \vsize v 0 s
+        vse8.v          v30, (a0)
+
+        .irp n 0,1,2,3,4,5
+#if __riscv_xlen >= 64
+        ld              s\n, \n\()<<3(sp)
+#else
+        lw              s\n, \n\()<<3(sp)
+#endif
+        .endr
+        addi            sp, sp, 48
 
         ret
 endfunc
@@ -265,4 +337,8 @@ epel \len 6 h
 epel \len 4 h
 epel \len 6 v
 epel \len 4 v
+epel_hv \len 6 6
+epel_hv \len 4 4
+epel_hv \len 6 4
+epel_hv \len 4 6
 .endr
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH 08/10] lavc/vp8dsp: R-V V loop_filter_simple
       [not found] <20240505164536.872683-1-uk7b@foxmail.com>
                   ` (5 preceding siblings ...)
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 07/10] lavc/vp8dsp: R-V V put_epel hv uk7b
@ 2024-05-05 16:45 ` uk7b
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 09/10] lavc/vp8dsp: R-V V loop_filter_inner uk7b
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 10/10] lavc/vp8dsp: R-V V loop_filter uk7b
  8 siblings, 0 replies; 13+ messages in thread
From: uk7b @ 2024-05-05 16:45 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_loop_filter_simple_h_c: 416.0
vp8_loop_filter_simple_h_rvv_i32: 187.5
vp8_loop_filter_simple_v_c: 429.7
vp8_loop_filter_simple_v_rvv_i32: 104.0
---
 libavcodec/riscv/vp8dsp_init.c |  5 ++
 libavcodec/riscv/vp8dsp_rvv.S  | 85 ++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 6ebb2e11e0..6037c86e19 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -42,6 +42,8 @@ VP8_BILIN(16, rvv);
 VP8_BILIN(8,  rvv);
 VP8_BILIN(4,  rvv);
 
+VP8_LF(rvv);
+
 av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
 {
 #if HAVE_RV
@@ -127,6 +129,9 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
         if (flags & AV_CPU_FLAG_RVB_ADDR) {
             c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
         }
+
+        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_rvv;
+        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_rvv;
     }
 #endif
 }
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 2d5e2260b7..bef5f0ebdc 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -95,6 +95,91 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
         ret
 endfunc
 
+.macro filter_fmin len a f1 p0f2 q0f1
+        vsetvlstatic16  \len
+        vsext.vf2       \q0f1, \a
+        vmin.vx         \p0f2, \q0f1, a7
+        vmin.vx         \q0f1, \q0f1, t3
+        vadd.vi         \p0f2, \p0f2, 3
+        vadd.vi         \q0f1, \q0f1, 4
+        vsra.vi         \p0f2, \p0f2, 3
+        vsra.vi         \f1,   \q0f1, 3
+        vadd.vv         \p0f2, \p0f2, v8
+        vsub.vv         \q0f1, v16, \f1
+        vmax.vx         \p0f2, \p0f2, zero
+        vmax.vx         \q0f1, \q0f1, zero
+.endm
+
+.macro filter len type normal inner dst stride fE fI thresh
+.ifc \type,v
+        slli            a6, \stride, 1
+        sub             t2, \dst, a6
+        add             t4, \dst, \stride
+        sub             t1, \dst, \stride
+        vle8.v          v1, (t2)
+        vle8.v          v11, (t4)
+        vle8.v          v17, (t1)
+        vle8.v          v22, (\dst)
+.else
+        addi            t1, \dst, -1
+        addi            a6, \dst, -2
+        addi            t4, \dst, 1
+        vlse8.v         v1, (a6), \stride
+        vlse8.v         v11, (t4), \stride
+        vlse8.v         v17, (t1), \stride
+        vlse8.v         v22, (\dst), \stride
+.endif
+        vwsubu.vv       v12, v1, v11             // p1-q1
+        vwsubu.vv       v24, v22, v17            // q0-p0
+        vnclip.wi       v23, v12, 0
+        vsetvlstatic16  \len
+        // vp8_simple_limit(dst + i, stride, flim)
+        li              a7, 2
+        vneg.v          v18, v12
+        vmax.vv         v18, v18, v12
+        vneg.v          v8, v24
+        vmax.vv         v8, v8, v24
+        vsrl.vi         v18, v18, 1
+        vmacc.vx        v18, a7, v8
+        vmsleu.vx       v0, v18, \fE
+
+        li              t5, 3
+        li              a7, 124
+        li              t3, 123
+        vsext.vf2       v4, v23
+        vzext.vf2       v8, v17                  // p0
+        vzext.vf2       v16, v22                 // q0
+        vmul.vx         v30, v24, t5
+        vadd.vv         v12, v30, v4
+        vsetvlstatic8   \len
+        vnclip.wi       v11, v12, 0
+        filter_fmin     \len v11 v24 v4 v6
+        vsetvlstatic8   \len
+        vnclipu.wi      v4, v4, 0
+        vnclipu.wi      v6, v6, 0
+
+.ifc \type,v
+        vse8.v          v4, (t1), v0.t
+        vse8.v          v6, (\dst), v0.t
+.else
+        vsse8.v         v4, (t1), \stride, v0.t
+        vsse8.v         v6, (\dst), \stride, v0.t
+.endif
+
+.endm
+
+func ff_vp8_v_loop_filter16_simple_rvv, zve32x
+        vsetvlstatic8   16
+        filter 16 v 0 0 a0 a1 a2 a3 a4
+        ret
+endfunc
+
+func ff_vp8_h_loop_filter16_simple_rvv, zve32x
+        vsetvlstatic8   16
+        filter 16 h 0 0 a0 a1 a2 a3 a4
+        ret
+endfunc
+
 .macro bilin_h_load dst len
         vsetvlstatic8   \len + 1
         vle8.v          \dst, (a2)
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH 09/10] lavc/vp8dsp: R-V V loop_filter_inner
       [not found] <20240505164536.872683-1-uk7b@foxmail.com>
                   ` (6 preceding siblings ...)
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 08/10] lavc/vp8dsp: R-V V loop_filter_simple uk7b
@ 2024-05-05 16:45 ` uk7b
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 10/10] lavc/vp8dsp: R-V V loop_filter uk7b
  8 siblings, 0 replies; 13+ messages in thread
From: uk7b @ 2024-05-05 16:45 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_loop_filter8uv_inner_v_c: 738.2
vp8_loop_filter8uv_inner_v_rvv_i32: 455.2
vp8_loop_filter16y_inner_h_c: 685.0
vp8_loop_filter16y_inner_h_rvv_i32: 497.0
vp8_loop_filter16y_inner_v_c: 743.7
vp8_loop_filter16y_inner_v_rvv_i32: 295.7
---
 libavcodec/riscv/vp8dsp_init.c |   4 ++
 libavcodec/riscv/vp8dsp_rvv.S  | 104 +++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 6037c86e19..4f38abba93 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -130,6 +130,10 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
             c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
         }
 
+        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_rvv;
+        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_rvv;
+        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_rvv;
+
         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_rvv;
         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_rvv;
     }
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index bef5f0ebdc..d7e8b6ae58 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -95,6 +95,13 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
         ret
 endfunc
 
+.macro filter_abs dst diff fI
+        vneg.v          v8, \diff
+        vmax.vv         \dst, v8, \diff
+        vmsleu.vx       v8, \dst, \fI
+        vmand.mm        v27, v27, v8
+.endm
+
 .macro filter_fmin len a f1 p0f2 q0f1
         vsetvlstatic16  \len
         vsext.vf2       \q0f1, \a
@@ -120,6 +127,16 @@ endfunc
         vle8.v          v11, (t4)
         vle8.v          v17, (t1)
         vle8.v          v22, (\dst)
+        .if \normal
+        sub             t3, t2, a6
+        sub             t0, t1, a6
+        add             t6, \dst, a6
+        add             a7, t4, a6
+        vle8.v          v2, (t3)
+        vle8.v          v15, (t0)
+        vle8.v          v10, (t6)
+        vle8.v          v14, (a7)
+        .endif
 .else
         addi            t1, \dst, -1
         addi            a6, \dst, -2
@@ -128,9 +145,27 @@ endfunc
         vlse8.v         v11, (t4), \stride
         vlse8.v         v17, (t1), \stride
         vlse8.v         v22, (\dst), \stride
+        .if \normal
+        addi            t5, \dst, -4
+        addi            t0, \dst, -3
+        addi            t6, \dst, 2
+        addi            a7, \dst, 3
+        vlse8.v         v2, (t5), \stride
+        vlse8.v         v15, (t0), \stride
+        vlse8.v         v10, (t6), \stride
+        vlse8.v         v14, (a7), \stride
+        .endif
 .endif
         vwsubu.vv       v12, v1, v11             // p1-q1
         vwsubu.vv       v24, v22, v17            // q0-p0
+.if \normal
+        vwsubu.vv       v30, v1, v17
+        vwsubu.vv       v20, v11, v22
+        vwsubu.vv       v28, v1, v15
+        vwsubu.vv       v4, v2, v15
+        vwsubu.vv       v6, v10, v11
+        vwsubu.vv       v2, v14, v10
+.endif
         vnclip.wi       v23, v12, 0
         vsetvlstatic16  \len
         // vp8_simple_limit(dst + i, stride, flim)
@@ -142,6 +177,25 @@ endfunc
         vsrl.vi         v18, v18, 1
         vmacc.vx        v18, a7, v8
         vmsleu.vx       v0, v18, \fE
+.if \normal
+        vneg.v          v18, v30
+        vmax.vv         v30, v18, v30
+        vmsleu.vx       v27, v30, \fI
+        filter_abs      v18 v28 \fI
+        filter_abs      v18 v4 \fI
+        filter_abs      v18 v6 \fI
+        filter_abs      v18 v2 \fI
+        filter_abs      v20 v20 \fI
+        vmand.mm        v27, v0, v27             // vp8_simple_limit && normal
+
+        vmsgtu.vx       v20, v20, \thresh        // hev
+        vmsgtu.vx       v3, v30, \thresh
+        vmor.mm         v3, v3, v20              // v3 = hev: > thresh
+        vzext.vf2       v18, v1                  // v18 = p1
+        vmand.mm        v0, v27, v3              // v0 = normal && hev
+        vzext.vf2       v20, v11                 // v20 = q1
+        vmnot.m         v3, v3                   // v3 = !hv
+.endif
 
         li              t5, 3
         li              a7, 124
@@ -166,6 +220,37 @@ endfunc
         vsse8.v         v6, (\dst), \stride, v0.t
 .endif
 
+.if \normal
+        vmand.mm        v0, v27, v3              // vp8_normal_limit & !hv
+
+        .if \inner
+        vnclip.wi       v30, v30, 0
+        filter_fmin     \len v30 v24 v4 v6
+        vadd.vi         v24, v24, 1
+        vsra.vi         v24, v24, 1              // (f1 + 1) >> 1;
+        vadd.vv         v8, v18, v24
+        vsub.vv         v10, v20, v24
+        .endif
+
+        vmax.vx         v8, v8, zero
+        vmax.vx         v10, v10, zero
+        vsetvlstatic8   \len
+        vnclipu.wi      v4, v4, 0
+        vnclipu.wi      v5, v6, 0
+        vnclipu.wi      v6, v8, 0
+        vnclipu.wi      v7, v10, 0
+        .ifc \type,v
+        vse8.v          v4, (t1), v0.t
+        vse8.v          v5, (\dst), v0.t
+        vse8.v          v6, (t2), v0.t
+        vse8.v          v7, (t4), v0.t
+        .else
+        vsse8.v         v4, (t1), \stride, v0.t
+        vsse8.v         v5, (\dst), \stride, v0.t
+        vsse8.v         v6, (a6), \stride, v0.t
+        vsse8.v         v7, (t4), \stride, v0.t
+        .endif
+.endif
 .endm
 
 func ff_vp8_v_loop_filter16_simple_rvv, zve32x
@@ -180,6 +265,25 @@ func ff_vp8_h_loop_filter16_simple_rvv, zve32x
         ret
 endfunc
 
+func ff_vp8_h_loop_filter16_inner_rvv, zve32x
+        vsetvlstatic8   16
+        filter 16 h 1 1 a0 a1 a2 a3 a4
+        ret
+endfunc
+
+func ff_vp8_v_loop_filter16_inner_rvv, zve32x
+        vsetvlstatic8   16
+        filter 16 v 1 1 a0 a1 a2 a3 a4
+        ret
+endfunc
+
+func ff_vp8_v_loop_filter8uv_inner_rvv, zve32x
+        vsetvlstatic8   8
+        filter 8 v 1 1 a0 a2 a3 a4 a5
+        filter 8 v 1 1 a1 a2 a3 a4 a5
+        ret
+endfunc
+
 .macro bilin_h_load dst len
         vsetvlstatic8   \len + 1
         vle8.v          \dst, (a2)
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH 10/10] lavc/vp8dsp: R-V V loop_filter
       [not found] <20240505164536.872683-1-uk7b@foxmail.com>
                   ` (7 preceding siblings ...)
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 09/10] lavc/vp8dsp: R-V V loop_filter_inner uk7b
@ 2024-05-05 16:45 ` uk7b
  8 siblings, 0 replies; 13+ messages in thread
From: uk7b @ 2024-05-05 16:45 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_loop_filter8uv_v_c: 745.5
vp8_loop_filter8uv_v_rvv_i32: 467.2
vp8_loop_filter16y_h_c: 674.2
vp8_loop_filter16y_h_rvv_i32: 553.0
vp8_loop_filter16y_v_c: 732.7
vp8_loop_filter16y_v_rvv_i32: 324.5
---
 libavcodec/riscv/vp8dsp_init.c |  4 +++
 libavcodec/riscv/vp8dsp_rvv.S  | 57 ++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 4f38abba93..35c1646dab 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -130,6 +130,10 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
             c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
         }
 
+        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_rvv;
+        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_rvv;
+        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_rvv;
+
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_rvv;
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_rvv;
         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_rvv;
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index d7e8b6ae58..360d79bc22 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -230,6 +230,33 @@ endfunc
         vsra.vi         v24, v24, 1              // (f1 + 1) >> 1;
         vadd.vv         v8, v18, v24
         vsub.vv         v10, v20, v24
+        .else
+        li              t5, 27
+        li              t3, 9
+        li              a7, 18
+        vwmul.vx        v2, v11, t5
+        vwmul.vx        v6, v11, t3
+        vwmul.vx        v4, v11, a7
+        vsetvlstatic16  \len
+        li              a7, 63
+        vzext.vf2       v14, v15                 // p2
+        vzext.vf2       v24, v10                 // q2
+        vadd.vx         v2, v2, a7
+        vadd.vx         v4, v4, a7
+        vadd.vx         v6, v6, a7
+        vsra.vi         v2, v2, 7                // a0
+        vsra.vi         v12, v4, 7               // a1
+        vsra.vi         v6, v6, 7                // a2
+        vadd.vv         v14, v14, v6             // p2 + a2
+        vsub.vv         v22, v24, v6             // q2 - a2
+        vsub.vv         v10, v20, v12            // q1 - a1
+        vadd.vv         v4, v8, v2               // p0 + a0
+        vsub.vv         v6, v16, v2              // q0 - a0
+        vadd.vv         v8, v12, v18             // a1 + p1
+        vmax.vx         v4, v4, zero
+        vmax.vx         v6, v6, zero
+        vmax.vx         v14, v14, zero
+        vmax.vx         v16, v22, zero
         .endif
 
         vmax.vx         v8, v8, zero
@@ -250,6 +277,17 @@ endfunc
         vsse8.v         v6, (a6), \stride, v0.t
         vsse8.v         v7, (t4), \stride, v0.t
         .endif
+        .if !\inner
+        vnclipu.wi      v14, v14, 0
+        vnclipu.wi      v16, v16, 0
+        .ifc \type,v
+        vse8.v          v14, (t0), v0.t
+        vse8.v          v16, (t6), v0.t
+        .else
+        vsse8.v         v14, (t0), \stride, v0.t
+        vsse8.v         v16, (t6), \stride, v0.t
+        .endif
+        .endif
 .endif
 .endm
 
@@ -284,6 +322,25 @@ func ff_vp8_v_loop_filter8uv_inner_rvv, zve32x
         ret
 endfunc
 
+func ff_vp8_v_loop_filter16_rvv, zve32x
+        vsetvlstatic8   16
+        filter 16 v 1 0 a0 a1 a2 a3 a4
+        ret
+endfunc
+
+func ff_vp8_h_loop_filter16_rvv, zve32x
+        vsetvlstatic8   16
+        filter 16 h 1 0 a0 a1 a2 a3 a4
+        ret
+endfunc
+
+func ff_vp8_v_loop_filter8uv_rvv, zve32x
+        vsetvlstatic8   8
+        filter 8 v 1 0 a0 a2 a3 a4 a5
+        filter 8 v 1 0 a1 a2 a3 a4 a5
+        ret
+endfunc
+
 .macro bilin_h_load dst len
         vsetvlstatic8   \len + 1
         vle8.v          \dst, (a2)
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [FFmpeg-devel] [PATCH 02/10] lavc/vp8dsp: R-V V put_bilin_h
  2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 02/10] lavc/vp8dsp: R-V V put_bilin_h uk7b
@ 2024-05-05 19:06   ` Rémi Denis-Courmont
  0 siblings, 0 replies; 13+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-05 19:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

Le sunnuntaina 5. toukokuuta 2024, 19.45.28 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
> C908:
> vp8_put_bilin4_h_c: 373.5
> vp8_put_bilin4_h_rvv_i32: 158.7
> vp8_put_bilin8_h_c: 1437.7
> vp8_put_bilin8_h_rvv_i32: 318.7
> vp8_put_bilin16_h_c: 2845.7
> vp8_put_bilin16_h_rvv_i32: 374.7
> ---
>  libavcodec/riscv/vp8dsp_init.c | 14 +++++++++++
>  libavcodec/riscv/vp8dsp_rvv.S  | 45 ++++++++++++++++++++++++++++++++++
>  2 files changed, 59 insertions(+)
> 
> diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
> index fa3feeacf7..778d5ceb29 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -34,6 +34,10 @@ VP8_EPEL(16, rvi);
>  VP8_EPEL(8,  rvi);
>  VP8_EPEL(4,  rvi);
> 
> +VP8_BILIN(16, rvv);
> +VP8_BILIN(8,  rvv);
> +VP8_BILIN(4,  rvv);
> +
>  av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  {
>  #if HAVE_RV
> @@ -48,6 +52,16 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>          c->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
>          c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
>      }
> +#if HAVE_RVV
> +    if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> +        c->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_rvv;
> +        c->put_vp8_bilinear_pixels_tab[0][0][2] =
> ff_put_vp8_bilin16_h_rvv; +        c->put_vp8_bilinear_pixels_tab[1][0][1]
> = ff_put_vp8_bilin8_h_rvv; +        c->put_vp8_bilinear_pixels_tab[1][0][2]
> = ff_put_vp8_bilin8_h_rvv; +        c->put_vp8_bilinear_pixels_tab[2][0][1]
> = ff_put_vp8_bilin4_h_rvv; +        c->put_vp8_bilinear_pixels_tab[2][0][2]
> = ff_put_vp8_bilin4_h_rvv; +    }
> +#endif
>  #endif
>  }
> 
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 8a0773f964..760d9d3871 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -20,6 +20,18 @@
> 
>  #include "libavutil/riscv/asm.S"
> 
> +.macro vsetvlstatic8 len
> +.if \len <= 4
> +        vsetivli        zero, \len, e8, mf4, ta, ma
> +.elseif \len <= 8
> +        vsetivli        zero, \len, e8, mf2, ta, ma
> +.elseif \len <= 16
> +        vsetivli        zero, \len, e8, m1, ta, ma
> +.elseif \len <= 31
> +        vsetivli        zero, \len, e8, m2, ta, ma
> +.endif
> +.endm
> +
>  .macro vp8_idct_dc_add
>          vlse32.v      v0, (a0), a2
>          lh            a5, 0(a1)
> @@ -71,3 +83,36 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
> 
>          ret
>  endfunc
> +
> +.macro bilin_h_load dst len
> +        vsetvlstatic8   \len + 1
> +        vle8.v          \dst, (a2)
> +        vslide1down.vx  v2, \dst, t5
> +        vsetvlstatic8   \len

Doesn't this effectively discard the last element, t5?
Can't we skip the slide and just load the vector at a2+1? Also then, we can 
keep VL=len and halve the multiplier.

> +        vwmulu.vx       v28, \dst, t1
> +        vwmaccu.vx      v28, a5, v2
> +        vwaddu.wx       v24, v28, t4
> +        vnsra.wi        \dst, v24, 3
> +.endm
> +
> +.macro put_vp8_bilin_h len
> +func ff_put_vp8_bilin\len\()_h_rvv, zve32x
> +        li              t1, 8
> +        li              t4, 4
> +        li              t5, 1
> +        sub             t1, t1, a5
> +1:
> +        addi            a4, a4, -1
> +        bilin_h_load    v0, \len
> +        vse8.v          v0, (a0)
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
> +.irp len 16,8,4
> +put_vp8_bilin_h \len
> +.endr


-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [FFmpeg-devel] [PATCH 02/10] lavc/vp8dsp: R-V V put_bilin_h
  2024-05-04 18:02   ` Rémi Denis-Courmont
@ 2024-05-04 18:06     ` Rémi Denis-Courmont
  0 siblings, 0 replies; 13+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-04 18:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

Le lauantaina 4. toukokuuta 2024, 21.02.25 EEST Rémi Denis-Courmont a écrit :
> Le lauantaina 4. toukokuuta 2024, 17.48.31 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> > 
> > C908:
> > vp8_put_bilin4_h_c: 373.5
> > vp8_put_bilin4_h_rvv_i32: 158.7
> > vp8_put_bilin8_h_c: 1437.7
> > vp8_put_bilin8_h_rvv_i32: 318.7
> > vp8_put_bilin16_h_c: 2845.7
> > vp8_put_bilin16_h_rvv_i32: 374.7
> > ---
> > 
> >  libavcodec/riscv/vp8dsp_init.c | 11 +++++++
> >  libavcodec/riscv/vp8dsp_rvv.S  | 54 ++++++++++++++++++++++++++++++++++
> >  2 files changed, 65 insertions(+)
> > 
> > diff --git a/libavcodec/riscv/vp8dsp_init.c
> > b/libavcodec/riscv/vp8dsp_init.c index c364de3dc9..32cb4893a4 100644
> > --- a/libavcodec/riscv/vp8dsp_init.c
> > +++ b/libavcodec/riscv/vp8dsp_init.c
> > @@ -34,6 +34,10 @@ VP8_EPEL(16, rvv);
> > 
> >  VP8_EPEL(8,  rvv);
> >  VP8_EPEL(4,  rvv);
> > 
> > +VP8_BILIN(16, rvv);
> > +VP8_BILIN(8,  rvv);
> > +VP8_BILIN(4,  rvv);
> > +
> > 
> >  av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> >  {
> >  #if HAVE_RVV
> > 
> > @@ -47,6 +51,13 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> > 
> >          c->put_vp8_bilinear_pixels_tab[0][0][0] =
> >          ff_put_vp8_pixels16_rvv;
> >          c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_rvv;
> >          c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvv;
> > 
> > +
> > +        c->put_vp8_bilinear_pixels_tab[0][0][1] =
> > ff_put_vp8_bilin16_h_rvv; +       
> > c->put_vp8_bilinear_pixels_tab[0][0][2] =
> > ff_put_vp8_bilin16_h_rvv; +        c->put_vp8_bilinear_pixels_tab[1][0][1]
> > = ff_put_vp8_bilin8_h_rvv; +       
> > c->put_vp8_bilinear_pixels_tab[1][0][2]
> > = ff_put_vp8_bilin8_h_rvv; +       
> > c->put_vp8_bilinear_pixels_tab[2][0][1]
> > = ff_put_vp8_bilin4_h_rvv; +       
> > c->put_vp8_bilinear_pixels_tab[2][0][2]
> > = ff_put_vp8_bilin4_h_rvv; }
> > 
> >  #endif
> >  }
> > 
> > diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> > index 063ab7110c..c8d265e516 100644
> > --- a/libavcodec/riscv/vp8dsp_rvv.S
> > +++ b/libavcodec/riscv/vp8dsp_rvv.S
> > @@ -98,3 +98,57 @@ func ff_put_vp8_pixels4_rvv, zve32x
> > 
> >          vsetivli      zero, 4, e8, mf4, ta, ma
> >          put_vp8_pixels
> >  
> >  endfunc
> > 
> > +
> > +.macro bilin_h_load dst len
> > +.ifc \len,4
> > +        vsetivli        zero, 5, e8, mf2, ta, ma
> > +.elseif \len == 8
> > +        vsetivli        zero, 9, e8, m1, ta, ma
> > +.else
> > +        vsetivli        zero, 17, e8, m2, ta, ma
> > +.endif
> 
> It might be worth defining a pseudo-instruction macro in asm.S that would
> statically compute the minimal LMUL from just the AVL and SEW. Then we don't
> have to repeat these if blocks time and again; we can just do:
> 
> vsetvlstatic \len + 1, e8
> 
> or something like that

On second thought, concealing the LMUL from the programmer is perhaps not the 
smartest idea, since it heavily constrains register allocation.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [FFmpeg-devel] [PATCH 02/10] lavc/vp8dsp: R-V V put_bilin_h
  2024-05-04 14:48 ` [FFmpeg-devel] [PATCH 02/10] lavc/vp8dsp: R-V V put_bilin_h uk7b
@ 2024-05-04 18:02   ` Rémi Denis-Courmont
  2024-05-04 18:06     ` Rémi Denis-Courmont
  0 siblings, 1 reply; 13+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-04 18:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

Le lauantaina 4. toukokuuta 2024, 17.48.31 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
> C908:
> vp8_put_bilin4_h_c: 373.5
> vp8_put_bilin4_h_rvv_i32: 158.7
> vp8_put_bilin8_h_c: 1437.7
> vp8_put_bilin8_h_rvv_i32: 318.7
> vp8_put_bilin16_h_c: 2845.7
> vp8_put_bilin16_h_rvv_i32: 374.7
> ---
>  libavcodec/riscv/vp8dsp_init.c | 11 +++++++
>  libavcodec/riscv/vp8dsp_rvv.S  | 54 ++++++++++++++++++++++++++++++++++
>  2 files changed, 65 insertions(+)
> 
> diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
> index c364de3dc9..32cb4893a4 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -34,6 +34,10 @@ VP8_EPEL(16, rvv);
>  VP8_EPEL(8,  rvv);
>  VP8_EPEL(4,  rvv);
> 
> +VP8_BILIN(16, rvv);
> +VP8_BILIN(8,  rvv);
> +VP8_BILIN(4,  rvv);
> +
>  av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  {
>  #if HAVE_RVV
> @@ -47,6 +51,13 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>          c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_rvv;
>          c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_rvv;
>          c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvv;
> +
> +        c->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_rvv;
> +        c->put_vp8_bilinear_pixels_tab[0][0][2] =
> ff_put_vp8_bilin16_h_rvv; +        c->put_vp8_bilinear_pixels_tab[1][0][1]
> = ff_put_vp8_bilin8_h_rvv; +        c->put_vp8_bilinear_pixels_tab[1][0][2]
> = ff_put_vp8_bilin8_h_rvv; +        c->put_vp8_bilinear_pixels_tab[2][0][1]
> = ff_put_vp8_bilin4_h_rvv; +        c->put_vp8_bilinear_pixels_tab[2][0][2]
> = ff_put_vp8_bilin4_h_rvv; }
>  #endif
>  }
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 063ab7110c..c8d265e516 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -98,3 +98,57 @@ func ff_put_vp8_pixels4_rvv, zve32x
>          vsetivli      zero, 4, e8, mf4, ta, ma
>          put_vp8_pixels
>  endfunc
> +
> +.macro bilin_h_load dst len
> +.ifc \len,4
> +        vsetivli        zero, 5, e8, mf2, ta, ma
> +.elseif \len == 8
> +        vsetivli        zero, 9, e8, m1, ta, ma
> +.else
> +        vsetivli        zero, 17, e8, m2, ta, ma
> +.endif

It might be worth defining a pseudo-instruction macro in asm.S that would 
statically compute the minimal LMUL from just the AVL and SEW. Then we don't 
have to repeat these if blocks time and again; we can just do:

vsetvlstatic \len + 1, e8

or something like that

> +
> +        vle8.v          \dst, (a2)
> +        vslide1down.vx  v2, \dst, t5
> +
> +.ifc \len,4
> +        vsetivli        zero, 4, e8, mf4, ta, ma
> +.elseif \len == 8
> +        vsetivli        zero, 8, e8, mf2, ta, ma
> +.else
> +        vsetivli        zero, 16, e8, m1, ta, ma
> +.endif
> +
> +        vwmulu.vx       v28, \dst, t1
> +        vwmaccu.vx      v28, a5, v2
> +        vwaddu.wx       v24, v28, t4
> +        vnsra.wi        \dst, v24, 3
> +.endm
> +
> +.macro put_vp8_bilin_h len
> +        li              t1, 8
> +        li              t4, 4
> +        li              t5, 1
> +        sub             t1, t1, a5
> +1:
> +        addi            a4, a4, -1
> +        bilin_h_load    v0, \len
> +        vse8.v          v0, (a0)
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +.endm

FWIW, it should be possible to include func and endfunc in the macro too.

> +
> +func ff_put_vp8_bilin16_h_rvv, zve32x
> +        put_vp8_bilin_h 16
> +endfunc
> +
> +func ff_put_vp8_bilin8_h_rvv, zve32x
> +        put_vp8_bilin_h 8
> +endfunc
> +
> +func ff_put_vp8_bilin4_h_rvv, zve32x
> +        put_vp8_bilin_h 4
> +endfunc


-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH 02/10] lavc/vp8dsp: R-V V put_bilin_h
       [not found] <20240504144840.2411603-1-uk7b@foxmail.com>
@ 2024-05-04 14:48 ` uk7b
  2024-05-04 18:02   ` Rémi Denis-Courmont
  0 siblings, 1 reply; 13+ messages in thread
From: uk7b @ 2024-05-04 14:48 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_put_bilin4_h_c: 373.5
vp8_put_bilin4_h_rvv_i32: 158.7
vp8_put_bilin8_h_c: 1437.7
vp8_put_bilin8_h_rvv_i32: 318.7
vp8_put_bilin16_h_c: 2845.7
vp8_put_bilin16_h_rvv_i32: 374.7
---
 libavcodec/riscv/vp8dsp_init.c | 11 +++++++
 libavcodec/riscv/vp8dsp_rvv.S  | 54 ++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index c364de3dc9..32cb4893a4 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -34,6 +34,10 @@ VP8_EPEL(16, rvv);
 VP8_EPEL(8,  rvv);
 VP8_EPEL(4,  rvv);
 
+VP8_BILIN(16, rvv);
+VP8_BILIN(8,  rvv);
+VP8_BILIN(4,  rvv);
+
 av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
 {
 #if HAVE_RVV
@@ -47,6 +51,13 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_rvv;
         c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_rvv;
         c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvv;
+
+        c->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_rvv;
+        c->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_rvv;
+        c->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_rvv;
+        c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv;
+        c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv;
+        c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv;
     }
 #endif
 }
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 063ab7110c..c8d265e516 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -98,3 +98,57 @@ func ff_put_vp8_pixels4_rvv, zve32x
         vsetivli      zero, 4, e8, mf4, ta, ma
         put_vp8_pixels
 endfunc
+
+.macro bilin_h_load dst len  // filter one row at (a2): dst[i] = (t1 * p[i] + a5 * p[i + 1] + t4) >> 3 for i < len; clobbers v2, v24, v28 and vl/vtype
+.ifc \len,4
+        vsetivli        zero, 5, e8, mf2, ta, ma  // vl = len + 1 bytes: one extra byte is needed as the right neighbour of the last element
+.elseif \len == 8
+        vsetivli        zero, 9, e8, m1, ta, ma  // vl = len + 1
+.else
+        vsetivli        zero, 17, e8, m2, ta, ma  // vl = len + 1 (this branch is taken for len = 16)
+.endif
+
+        vle8.v          \dst, (a2)  // dst[0..len] = bytes at a2
+        vslide1down.vx  v2, \dst, t5  // v2[i] = dst[i + 1]; t5 only fills the top element (index len), which is not used below
+
+.ifc \len,4
+        vsetivli        zero, 4, e8, mf4, ta, ma  // vl = len, with half the LMUL used for the load above
+.elseif \len == 8
+        vsetivli        zero, 8, e8, mf2, ta, ma  // vl = len
+.else
+        vsetivli        zero, 16, e8, m1, ta, ma  // vl = len
+.endif
+
+        vwmulu.vx       v28, \dst, t1  // v28 (16-bit) = dst * t1; put_vp8_bilin_h sets t1 = 8 - a5
+        vwmaccu.vx      v28, a5, v2  // v28 += a5 * v2
+        vwaddu.wx       v24, v28, t4  // v24 = v28 + t4; put_vp8_bilin_h sets t4 = 4, i.e. half of the divisor 8
+        vnsra.wi        \dst, v24, 3  // narrow back to 8 bits: dst = v24 >> 3
+.endm
+
+.macro put_vp8_bilin_h len  // function body: filter a4 rows of len bytes with bilin_h_load and store them; NOTE(review): presumably a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height, a5 = horizontal fraction - prototype not shown here, confirm
+        li              t1, 8
+        li              t4, 4  // constant added before the >> 3 in bilin_h_load
+        li              t5, 1  // scalar operand for vslide1down.vx in bilin_h_load
+        sub             t1, t1, a5  // t1 = 8 - a5, multiplier for the current byte (a5 multiplies its right neighbour)
+1:
+        addi            a4, a4, -1  // row counter is decremented before the test, so the body always runs at least once
+        bilin_h_load    v0, \len  // v0 = filtered row read from (a2); leaves vl = len
+        vse8.v          v0, (a0)  // store len bytes to (a0)
+        add             a2, a2, a3  // advance the load pointer by a3
+        add             a0, a0, a1  // advance the store pointer by a1
+        bnez            a4, 1b  // loop until a4 reaches zero
+
+        ret
+.endm
+
+func ff_put_vp8_bilin16_h_rvv, zve32x  // put_vp8_bilin_h instantiated for 16 bytes per row
+        put_vp8_bilin_h 16
+endfunc
+
+func ff_put_vp8_bilin8_h_rvv, zve32x  // put_vp8_bilin_h instantiated for 8 bytes per row
+        put_vp8_bilin_h 8
+endfunc
+
+func ff_put_vp8_bilin4_h_rvv, zve32x  // put_vp8_bilin_h instantiated for 4 bytes per row
+        put_vp8_bilin_h 4
+endfunc
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2024-05-05 19:06 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20240505164536.872683-1-uk7b@foxmail.com>
2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 02/10] lavc/vp8dsp: R-V V put_bilin_h uk7b
2024-05-05 19:06   ` Rémi Denis-Courmont
2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 03/10] lavc/vp8dsp: R-V V put_bilin_v uk7b
2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 04/10] lavc/vp8dsp: R-V V put_bilin_hv uk7b
2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 05/10] lavc/vp8dsp: R-V V put_epel h uk7b
2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 06/10] lavc/vp8dsp: R-V V put_epel v uk7b
2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 07/10] lavc/vp8dsp: R-V V put_epel hv uk7b
2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 08/10] lavc/vp8dsp: R-V V loop_filter_simple uk7b
2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 09/10] lavc/vp8dsp: R-V V loop_filter_inner uk7b
2024-05-05 16:45 ` [FFmpeg-devel] [PATCH 10/10] lavc/vp8dsp: R-V V loop_filter uk7b
     [not found] <20240504144840.2411603-1-uk7b@foxmail.com>
2024-05-04 14:48 ` [FFmpeg-devel] [PATCH 02/10] lavc/vp8dsp: R-V V put_bilin_h uk7b
2024-05-04 18:02   ` Rémi Denis-Courmont
2024-05-04 18:06     ` Rémi Denis-Courmont

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git