Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv
       [not found] <20240803095040.3648582-1-uk7b@foxmail.com>
@ 2024-08-03  9:50 ` uk7b
  2024-08-07 17:23   ` Rémi Denis-Courmont
  2024-08-03  9:50 ` [FFmpeg-devel] [PATCH 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
  2024-08-03  9:50 ` [FFmpeg-devel] [PATCH 4/4] lavc/vp9dsp: R-V V mc tap hv uk7b
  2 siblings, 1 reply; 7+ messages in thread
From: uk7b @ 2024-08-03  9:50 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_bilin_4hv_8bpp_c                           :   10.7    9.5
vp9_avg_bilin_4hv_8bpp_rvv_i32                     :    4.0    3.5
vp9_avg_bilin_8hv_8bpp_c                           :   38.5   34.2
vp9_avg_bilin_8hv_8bpp_rvv_i32                     :    7.2    6.5
vp9_avg_bilin_16hv_8bpp_c                          :  147.2  130.5
vp9_avg_bilin_16hv_8bpp_rvv_i32                    :   14.5   12.7
vp9_avg_bilin_32hv_8bpp_c                          :  574.2  509.7
vp9_avg_bilin_32hv_8bpp_rvv_i32                    :   42.5   38.0
vp9_avg_bilin_64hv_8bpp_c                          : 2321.2 2017.7
vp9_avg_bilin_64hv_8bpp_rvv_i32                    :  163.5  131.0
vp9_put_bilin_4hv_8bpp_c                           :   10.0    8.7
vp9_put_bilin_4hv_8bpp_rvv_i32                     :    3.5    3.0
vp9_put_bilin_8hv_8bpp_c                           :   35.2   31.2
vp9_put_bilin_8hv_8bpp_rvv_i32                     :    6.5    5.7
vp9_put_bilin_16hv_8bpp_c                          :  134.0  119.0
vp9_put_bilin_16hv_8bpp_rvv_i32                    :   12.7   11.5
vp9_put_bilin_32hv_8bpp_c                          :  538.5  464.2
vp9_put_bilin_32hv_8bpp_rvv_i32                    :   39.7   35.2
vp9_put_bilin_64hv_8bpp_c                          : 2111.7 1833.2
vp9_put_bilin_64hv_8bpp_rvv_i32                    :  138.5  122.5
---
 libavcodec/riscv/vp9_mc_rvv.S  | 39 +++++++++++++++++++++++++++++++++-
 libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 817cc58b5e..caab2093be 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -149,6 +149,41 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
 endfunc
 .endm
 
+.macro bilin_hv op
+func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
+        lpad    0
+        vsetvlstatic8   64, t0, 64
+.Lbilin_hv\op:
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+        neg             t1, a5
+        neg             t2, a6
+        li              t4, 8
+        bilin_load_h    v24, put, a5
+        add             a2, a2, a3
+1:
+        addi            a4, a4, -1
+        bilin_load_h    v4, put, a5
+        vwmulu.vx       v16, v4, a6
+        vwmaccsu.vx     v16, t2, v24
+        vwadd.wx        v16, v16, t4
+        vnsra.wi        v16, v16, 4
+        vadd.vv         v0, v16, v24
+.ifc \op,avg
+        vle8.v          v16, (a0)
+        vaaddu.vv       v0, v0, v16
+.endif
+        vse8.v          v0, (a0)
+        vmv.v.v         v24, v4
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
 .endr
@@ -157,6 +192,8 @@ bilin_h_v  put, h, a5
 bilin_h_v  avg, h, a5
 bilin_h_v  put, v, a6
 bilin_h_v  avg, v, a6
+bilin_hv   put
+bilin_hv   avg
 
 .macro func_bilin_h_v len, op, type
 func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
@@ -168,7 +205,7 @@ endfunc
 
 .irp len, 32, 16, 8, 4
         .irp op, put, avg
-                .irp type, h, v
+                .irp type, h, v, hv
                         func_bilin_h_v \len, \op, \type
                 .endr
         .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 83dbe1b5d9..d53852f673 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
     dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
     dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
 
 #undef init_fpel
     }
-- 
2.46.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH 3/4] lavc/vp9dsp: R-V V mc tap h v
       [not found] <20240803095040.3648582-1-uk7b@foxmail.com>
  2024-08-03  9:50 ` [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-08-03  9:50 ` uk7b
  2024-08-03  9:50 ` [FFmpeg-devel] [PATCH 4/4] lavc/vp9dsp: R-V V mc tap hv uk7b
  2 siblings, 0 replies; 7+ messages in thread
From: uk7b @ 2024-08-03  9:50 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_8tap_smooth_4h_8bpp_c                      :   12.7   11.2
vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    4.7    4.2
vp9_avg_8tap_smooth_4v_8bpp_c                      :   29.7   12.5
vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    4.7    4.2
vp9_avg_8tap_smooth_8h_8bpp_c                      :   48.7   42.2
vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.5    8.5
vp9_avg_8tap_smooth_8v_8bpp_c                      :   49.7   45.5
vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.0  166.5
vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.7   19.5
vp9_avg_8tap_smooth_16v_8bpp_c                     :  191.2  175.2
vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.2   19.0
vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.2
vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   68.2   60.5
vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.0  685.7
vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.0   59.5
vp9_avg_8tap_smooth_64h_8bpp_c                     : 3116.2 2648.2
vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  270.7  120.7
vp9_avg_8tap_smooth_64v_8bpp_c                     : 3058.5 2731.7
vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  266.5  119.0
vp9_put_8tap_smooth_4h_8bpp_c                      :   11.0    9.7
vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    3.7
vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.5
vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.0    3.7
vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
vp9_put_8tap_smooth_8v_8bpp_c                      :   43.5   38.5
vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.7    7.7
vp9_put_8tap_smooth_16h_8bpp_c                     :  181.7  147.2
vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   20.0   18.0
vp9_put_8tap_smooth_16v_8bpp_c                     :  168.5  149.7
vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
vp9_put_8tap_smooth_32h_8bpp_c                     :  675.0  586.5
vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   65.2   58.0
vp9_put_8tap_smooth_32v_8bpp_c                     :  664.7  591.2
vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.0   57.0
vp9_put_8tap_smooth_64h_8bpp_c                     : 2696.2 2339.0
vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  259.7  115.7
vp9_put_8tap_smooth_64v_8bpp_c                     : 2691.0 2348.5
vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
---
 libavcodec/riscv/vp9_mc_rvv.S  | 183 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp.h      |  72 ++++++++-----
 libavcodec/riscv/vp9dsp_init.c |  38 ++++++-
 3 files changed, 268 insertions(+), 25 deletions(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index caab2093be..c9fee6a3c5 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -36,6 +36,18 @@
 .endif
 .endm
 
+.macro vsetvlstatic16 len
+.ifc \len,4
+        vsetvli         zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+        vsetvli         zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+        vsetvli         zero, zero, e16, m2, ta, ma
+.else
+        vsetvli         zero, zero, e16, m4, ta, ma
+.endif
+.endm
+
 .macro copy_avg len
 func ff_vp9_avg\len\()_rvv, zve32x
         lpad    0
@@ -184,8 +196,179 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
 endfunc
 .endm
 
+.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
+.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
+.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
+
+.macro epel_filter name, type, regtype, arg
+        lla             \regtype\()2, ff_vp9_subpel_filters_\name
+.ifc \type,v
+        slli            \regtype\()0, a6, 4
+.else
+        slli            \regtype\()0, a5, 4
+.endif
+        add             \regtype\()0, \regtype\()0, \regtype\()2
+        lh              \regtype\()1, 2(\regtype\()0)
+        lh              \regtype\()2, 4(\regtype\()0)
+        lh              \regtype\()3, 6(\regtype\()0)
+        lh              \regtype\()4, 8(\regtype\()0)
+        lh              \regtype\()5, 10(\regtype\()0)
+        lh              \regtype\()6, 12(\regtype\()0)
+        lh              \arg, 14(\regtype\()0)
+        lh              \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst, len, op, name, type, from_mem, regtype
+.ifc \from_mem, 1
+        vle8.v          v22, (a2)
+.ifc \type,v
+        add             a5, a3, a2
+        sub             a2, a2, a3
+        vle8.v          v24, (a5)
+        vle8.v          v20, (a2)
+        sh1add          a2, a3, a5
+        add             a5, a5, a3
+        vle8.v          v26, (a5)
+        vle8.v          v28, (a2)
+        add             a2, a2, a3
+        vle8.v          v30, (a2)
+.else
+        addi            a5, a2, 1
+        addi            a2, a2, -1
+        vle8.v          v24, (a5)
+        vle8.v          v20, (a2)
+        addi            a5, a5, 2
+        addi            a2, a2, 3
+        vle8.v          v28, (a5)
+        vle8.v          v26, (a2)
+        addi            a2, a5, 1
+        vle8.v          v30, (a2)
+.endif
+
+.ifc \name,smooth
+        vwmulu.vx       v16, v24, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v20
+        vwmaccu.vx      v16, \regtype\()5, v26
+        vwmaccsu.vx     v16, \regtype\()6, v28
+.else
+        vwmulu.vx       v16, v28, \regtype\()6
+        vwmaccsu.vx     v16, \regtype\()2, v20
+        vwmaccsu.vx     v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v30
+.else
+        vwmaccsu.vx     v16, s7, v30
+.endif
+
+.ifc \type,v
+        sh1add          a5, a3, a3
+        sub             a2, a2, a5
+        sub             a2, a2, a5
+        sub             a5, a2, a3
+        vle8.v          v28, (a2)
+        vle8.v          v26, (a5)
+        sh1add          a2, a3, a2
+.else
+        addi            a5, a2, -7
+        addi            a2, a2, -6
+        vle8.v          v26, (a5)
+        vle8.v          v28, (a2)
+        addi            a2, a2, 2
+.endif
+
+.ifc \name,smooth
+        vwmaccsu.vx     v16, \regtype\()1, v28
+.else
+        vwmaccu.vx      v16, \regtype\()1, v28
+        vwmulu.vx       v28, v24, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v26
+        vwmulu.vx       v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+        vwmulu.vx       v16, v8, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v4
+        vwmaccu.vx      v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()1, v2
+.else
+        vwmulu.vx       v16, v2, \regtype\()1
+        vwmaccu.vx      v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()2, v4
+        vwmulu.vx       v28, v8, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v0
+        vwmulu.vx       v20, v6, \regtype\()3
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v14
+.else
+        vwmaccsu.vx     v16, s7, v14
+.endif
+
+.endif
+        vsetvlstatic16  \len
+.ifc \name,smooth
+        vwadd.vv        v24, v16, v20
+.else
+        vwadd.vv        v24, v16, v28
+        vwadd.wv        v24, v24, v20
+.endif
+        vnclipu.wi      v24, v24, 7
+        vmax.vx         v24, v24, zero
+        vsetvlstatic8   \len, zero, 32, m2
+        vnclipu.wi      \dst, v24, 0
+.ifc \op,avg
+        vle8.v          v24, (a0)
+        vaaddu.vv       \dst, \dst, v24
+.endif
+
+.endm
+
+.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
+        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
+        add             a2, a2, a3
+.endm
+
+.macro epel len, op, name, type, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
+        lpad    0
+        epel_filter     \name, \type, t, a7
+        vsetvlstatic8   \len, a5, 64, m2
+        csrwi           vxrm, 0
+1:
+        addi            a4, a4, -1
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+.if \len == 64 && \vlen < 256
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+        addi            a0, a0, -32
+        addi            a2, a2, -32
+.endif
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
+        .irp op, put, avg
+                .irp name, regular, sharp, smooth
+                        .irp type, h, v
+                                epel \len, \op, \name, \type, 128
+                                epel \len, \op, \name, \type, 256
+                        .endr
+                .endr
+        .endr
 .endr
 
 bilin_h_v  put, h, a5
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 8fb326dae0..5fd64a1b8c 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
 void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
                    const uint8_t *a);
 
-#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                         \
-void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)              \
+void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);             \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);
@@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,     \
                         const uint8_t *src, ptrdiff_t srcstride,   \
                         int h, int mx, int my);
 
-VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
 
 VP9_BILINEAR_RISCV_RVV_FUNC(64);
 VP9_BILINEAR_RISCV_RVV_FUNC(32);
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index d53852f673..d7a1f51d6e 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
 # endif
 
 #if HAVE_RVV
-    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
+    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
+    int vlenb = ff_get_rv_vlenb();
+    if (vlenb >= 16) {
 
 #define init_fpel(idx1, sz)                                           \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
@@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
 
 #undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
+
+#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
+
+    init_subpel2(0, 1, 0, h, put, 128);
+    init_subpel2(1, 1, 0, h, avg, 128);
+
+    if (flags & AV_CPU_FLAG_RVB_ADDR) {
+        init_subpel2(0, 0, 1, v, put, 128);
+        init_subpel2(1, 0, 1, v, avg, 128);
+    }
+
+    }
+    if (vlenb >= 32) {
+        init_subpel2(0, 1, 0, h, put, 256);
+        init_subpel2(1, 1, 0, h, avg, 256);
+
+        if (flags & AV_CPU_FLAG_RVB_ADDR) {
+            init_subpel2(0, 0, 1, v, put, 256);
+            init_subpel2(1, 0, 1, v, avg, 256);
+        }
+    }
     }
 #endif
 #endif
-- 
2.46.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH 4/4] lavc/vp9dsp: R-V V mc tap hv
       [not found] <20240803095040.3648582-1-uk7b@foxmail.com>
  2024-08-03  9:50 ` [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
  2024-08-03  9:50 ` [FFmpeg-devel] [PATCH 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
@ 2024-08-03  9:50 ` uk7b
  2 siblings, 0 replies; 7+ messages in thread
From: uk7b @ 2024-08-03  9:50 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_8tap_smooth_4hv_8bpp_c                     :   32.0   28.0
vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32               :   15.0   13.2
vp9_avg_8tap_smooth_8hv_8bpp_c                     :   98.0   86.2
vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32               :   23.7   21.2
vp9_avg_8tap_smooth_16hv_8bpp_c                    :  355.7  297.0
vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32              :   47.0   41.5
vp9_avg_8tap_smooth_32hv_8bpp_c                    : 1272.7 1099.7
vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32              :  134.7  119.7
vp9_avg_8tap_smooth_64hv_8bpp_c                    : 4937.0 4224.2
vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32              :  528.5  228.5
vp9_put_8tap_smooth_4hv_8bpp_c                     :   30.2   26.7
vp9_put_8tap_smooth_4hv_8bpp_rvv_i32               :   30.5   12.5
vp9_put_8tap_smooth_8hv_8bpp_c                     :   91.5   81.2
vp9_put_8tap_smooth_8hv_8bpp_rvv_i32               :   22.7   20.2
vp9_put_8tap_smooth_16hv_8bpp_c                    :  313.2  277.5
vp9_put_8tap_smooth_16hv_8bpp_rvv_i32              :   45.2   40.2
vp9_put_8tap_smooth_32hv_8bpp_c                    : 1166.7 1022.2
vp9_put_8tap_smooth_32hv_8bpp_rvv_i32              :  131.7  117.2
vp9_put_8tap_smooth_64hv_8bpp_c                    : 4560.5 3961.7
vp9_put_8tap_smooth_64hv_8bpp_rvv_i32              :  517.0  223.2
---
 libavcodec/riscv/vp9_mc_rvv.S  | 70 ++++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp_init.c |  8 ++++
 2 files changed, 78 insertions(+)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index c9fee6a3c5..41034444fa 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -359,6 +359,72 @@ func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
 endfunc
 .endm
 
+#if __riscv_xlen == 64
+.macro epel_hv_once len, name, op
+        sub             a2, a2, a3
+        sub             a2, a2, a3
+        sub             a2, a2, a3
+        .irp n,0,2,4,6,8,10,12,14
+        epel_load_inc   v\n, \len, put, \name, h, 1, t
+        .endr
+        addi            a4, a4, -1
+1:
+        addi            a4, a4, -1
+        epel_load       v30, \len, \op, \name, v, 0, s
+        vse8.v          v30, (a0)
+        vmv.v.v         v0, v2
+        vmv.v.v         v2, v4
+        vmv.v.v         v4, v6
+        vmv.v.v         v6, v8
+        vmv.v.v         v8, v10
+        vmv.v.v         v10, v12
+        vmv.v.v         v12, v14
+        epel_load       v14, \len, put, \name, h, 1, t
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+        epel_load       v30, \len, \op, \name, v, 0, s
+        vse8.v          v30, (a0)
+.endm
+
+.macro epel_hv op, name, len, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x
+        lpad    0
+        addi            sp, sp, -64
+        .irp n,0,1,2,3,4,5,6,7
+        sd              s\n, \n\()<<3(sp)
+        .endr
+.if \len == 64 && \vlen < 256
+        addi            sp, sp, -48
+        .irp n,0,1,2,3,4,5
+        sd              a\n, \n\()<<3(sp)
+        .endr
+.endif
+        csrwi           vxrm, 0
+        epel_filter     \name, h, t, a7
+        epel_filter     \name, v, s, s7
+        vsetvlstatic8   \len, a6, 64, m2
+        epel_hv_once    \len, \name, \op
+.if \len == 64 && \vlen < 256
+        .irp n,0,1,2,3,4,5
+        ld              a\n, \n\()<<3(sp)
+        .endr
+        addi            sp, sp, 48
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_filter     \name, h, t, a7
+        epel_hv_once    \len, \name, \op
+.endif
+        .irp n,0,1,2,3,4,5,6,7
+        ld              s\n, \n\()<<3(sp)
+        .endr
+        addi            sp, sp, 64
+
+        ret
+endfunc
+.endm
+#endif
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
         .irp op, put, avg
@@ -367,6 +433,10 @@ endfunc
                                 epel \len, \op, \name, \type, 128
                                 epel \len, \op, \name, \type, 256
                         .endr
+                        #if __riscv_xlen == 64
+                        epel_hv \op, \name, \len, 128
+                        epel_hv \op, \name, \len, 256
+                        #endif
                 .endr
         .endr
 .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index d7a1f51d6e..6999e93374 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -119,6 +119,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     if (flags & AV_CPU_FLAG_RVB_ADDR) {
         init_subpel2(0, 0, 1, v, put, 128);
         init_subpel2(1, 0, 1, v, avg, 128);
+# if __riscv_xlen == 64
+        init_subpel2(0, 1, 1, hv, put, 128);
+        init_subpel2(1, 1, 1, hv, avg, 128);
+# endif
     }
 
     }
@@ -129,6 +133,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
         if (flags & AV_CPU_FLAG_RVB_ADDR) {
             init_subpel2(0, 0, 1, v, put, 256);
             init_subpel2(1, 0, 1, v, avg, 256);
+# if __riscv_xlen == 64
+            init_subpel2(0, 1, 1, hv, put, 256);
+            init_subpel2(1, 1, 1, hv, avg, 256);
+# endif
         }
     }
     }
-- 
2.46.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv
  2024-08-03  9:50 ` [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-08-07 17:23   ` Rémi Denis-Courmont
  0 siblings, 0 replies; 7+ messages in thread
From: Rémi Denis-Courmont @ 2024-08-07 17:23 UTC (permalink / raw)
  To: ffmpeg-devel

Le lauantaina 3. elokuuta 2024, 12.50.38 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
>                                                      C908   X60
> vp9_avg_bilin_4hv_8bpp_c                           :   10.7    9.5
> vp9_avg_bilin_4hv_8bpp_rvv_i32                     :    4.0    3.5
> vp9_avg_bilin_8hv_8bpp_c                           :   38.5   34.2
> vp9_avg_bilin_8hv_8bpp_rvv_i32                     :    7.2    6.5
> vp9_avg_bilin_16hv_8bpp_c                          :  147.2  130.5
> vp9_avg_bilin_16hv_8bpp_rvv_i32                    :   14.5   12.7
> vp9_avg_bilin_32hv_8bpp_c                          :  574.2  509.7
> vp9_avg_bilin_32hv_8bpp_rvv_i32                    :   42.5   38.0
> vp9_avg_bilin_64hv_8bpp_c                          : 2321.2 2017.7
> vp9_avg_bilin_64hv_8bpp_rvv_i32                    :  163.5  131.0
> vp9_put_bilin_4hv_8bpp_c                           :   10.0    8.7
> vp9_put_bilin_4hv_8bpp_rvv_i32                     :    3.5    3.0
> vp9_put_bilin_8hv_8bpp_c                           :   35.2   31.2
> vp9_put_bilin_8hv_8bpp_rvv_i32                     :    6.5    5.7
> vp9_put_bilin_16hv_8bpp_c                          :  134.0  119.0
> vp9_put_bilin_16hv_8bpp_rvv_i32                    :   12.7   11.5
> vp9_put_bilin_32hv_8bpp_c                          :  538.5  464.2
> vp9_put_bilin_32hv_8bpp_rvv_i32                    :   39.7   35.2
> vp9_put_bilin_64hv_8bpp_c                          : 2111.7 1833.2
> vp9_put_bilin_64hv_8bpp_rvv_i32                    :  138.5  122.5
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 39 +++++++++++++++++++++++++++++++++-
>  libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
>  2 files changed, 48 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 817cc58b5e..caab2093be 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -149,6 +149,41 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
>  endfunc
>  .endm
> 
> +.macro bilin_hv op
> +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> +        lpad    0
> +        vsetvlstatic8   64, t0, 64
> +.Lbilin_hv\op:
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +        neg             t1, a5
> +        neg             t2, a6
> +        li              t4, 8
> +        bilin_load_h    v24, put, a5
> +        add             a2, a2, a3
> +1:
> +        addi            a4, a4, -1
> +        bilin_load_h    v4, put, a5
> +        vwmulu.vx       v16, v4, a6
> +        vwmaccsu.vx     v16, t2, v24
> +        vwadd.wx        v16, v16, t4
> +        vnsra.wi        v16, v16, 4
> +        vadd.vv         v0, v16, v24
> +.ifc \op,avg
> +        vle8.v          v16, (a0)
> +        vaaddu.vv       v0, v0, v16
> +.endif
> +        vse8.v          v0, (a0)
> +        vmv.v.v         v24, v4

That seems suboptimal and unnecessary.

> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
>  .endr
> @@ -157,6 +192,8 @@ bilin_h_v  put, h, a5
>  bilin_h_v  avg, h, a5
>  bilin_h_v  put, v, a6
>  bilin_h_v  avg, v, a6
> +bilin_hv   put
> +bilin_hv   avg
> 
>  .macro func_bilin_h_v len, op, type
>  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> @@ -168,7 +205,7 @@ endfunc
> 
>  .irp len, 32, 16, 8, 4
>          .irp op, put, avg
> -                .irp type, h, v
> +                .irp type, h, v, hv
>                          func_bilin_h_v \len, \op, \type
>                  .endr
>          .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index 83dbe1b5d9..d53852f673 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>      dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
>      dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
>      dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
> +    dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
> +    dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
> +    dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
> +    dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
> +    dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
> +    dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
> +    dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
> +    dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
> +    dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
> +    dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
> 
>  #undef init_fpel
>      }


-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv
  2024-08-09 14:24 ` [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-08-09 14:24   ` flow gg
  0 siblings, 0 replies; 7+ messages in thread
From: flow gg @ 2024-08-09 14:24 UTC (permalink / raw)
  To: FFmpeg development discussions and patches, Rémi Denis-Courmont

> That seems suboptimal and unnecessary.

Updated it, there is no longer any vmv.

<uk7b@foxmail.com> 于2024年8月9日周五 22:24写道:

> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
>                                                      C908   X60
> vp9_avg_bilin_4hv_8bpp_c                           :   10.7    9.5
> vp9_avg_bilin_4hv_8bpp_rvv_i32                     :    4.0    3.5
> vp9_avg_bilin_8hv_8bpp_c                           :   38.5   34.2
> vp9_avg_bilin_8hv_8bpp_rvv_i32                     :    7.2    6.5
> vp9_avg_bilin_16hv_8bpp_c                          :  147.2  130.5
> vp9_avg_bilin_16hv_8bpp_rvv_i32                    :   14.5   12.7
> vp9_avg_bilin_32hv_8bpp_c                          :  574.2  509.7
> vp9_avg_bilin_32hv_8bpp_rvv_i32                    :   42.5   38.0
> vp9_avg_bilin_64hv_8bpp_c                          : 2321.2 2017.7
> vp9_avg_bilin_64hv_8bpp_rvv_i32                    :  163.5  131.0
> vp9_put_bilin_4hv_8bpp_c                           :   10.0    8.7
> vp9_put_bilin_4hv_8bpp_rvv_i32                     :    3.5    3.0
> vp9_put_bilin_8hv_8bpp_c                           :   35.2   31.2
> vp9_put_bilin_8hv_8bpp_rvv_i32                     :    6.5    5.7
> vp9_put_bilin_16hv_8bpp_c                          :  134.0  119.0
> vp9_put_bilin_16hv_8bpp_rvv_i32                    :   12.7   11.5
> vp9_put_bilin_32hv_8bpp_c                          :  538.5  464.2
> vp9_put_bilin_32hv_8bpp_rvv_i32                    :   39.7   35.2
> vp9_put_bilin_64hv_8bpp_c                          : 2111.7 1833.2
> vp9_put_bilin_64hv_8bpp_rvv_i32                    :  138.5  122.5
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 46 +++++++++++++++++++++++++++++++++-
>  libavcodec/riscv/vp9dsp_init.c | 10 ++++++++
>  2 files changed, 55 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 9e8061616f..d1ddbe007b 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -138,6 +138,48 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
>  endfunc
>  .endm
>
> +.macro bilin_hv op
> +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> +        lpad    0
> +        vsetvlstatic8   64, t0, 64
> +.Lbilin_hv\op:
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +        neg             t1, a5
> +        neg             t2, a6
> +        li              t3, 8
> +        bilin_load      v24, a5, h
> +1:
> +        addi            a4, a4, -2
> +        bilin_load      v8, a5, h
> +        vwmulu.vx       v16, v8, a6
> +        vwmaccsu.vx     v16, t2, v24
> +        vwadd.wx        v16, v16, t3
> +        vnsra.wi        v16, v16, 4
> +        vadd.vv         v12, v16, v24
> +        add             t5, a0, a1
> +        bilin_load      v24, a5, h
> +        vwmulu.vx       v16, v24, a6
> +        vwmaccsu.vx     v16, t2, v8
> +        vwadd.wx        v16, v16, t3
> +        vnsra.wi        v16, v16, 4
> +        vadd.vv         v0, v16, v8
> +.ifc \op,avg
> +        vle8.v          v8, (a0)
> +        vle8.v          v16, (t5)
> +        vaaddu.vv       v12, v12, v8
> +        vaaddu.vv       v0, v0, v16
> +.endif
> +        vse8.v          v12, (a0)
> +        vse8.v          v0, (t5)
> +        add             a0, t5, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
>  .endr
> @@ -146,6 +188,8 @@ bilin_h_v  put, h, a5
>  bilin_h_v  avg, h, a5
>  bilin_h_v  put, v, a6
>  bilin_h_v  avg, v, a6
> +bilin_hv   put
> +bilin_hv   avg
>
>  .macro func_bilin_h_v len, op, type
>  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> @@ -157,7 +201,7 @@ endfunc
>
>  .irp len, 32, 16, 8, 4
>          .irp op, put, avg
> -                .irp type, h, v
> +                .irp type, h, v, hv
>                          func_bilin_h_v \len, \op, \type
>                  .endr
>          .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index 83dbe1b5d9..d53852f673 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>      dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
>      dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
>      dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
> +    dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
> +    dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
> +    dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
> +    dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
> +    dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
> +    dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
> +    dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
> +    dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
> +    dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
> +    dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
>
>  #undef init_fpel
>      }
> --
> 2.46.0
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv
       [not found] <20240809142406.3845162-1-uk7b@foxmail.com>
@ 2024-08-09 14:24 ` uk7b
  2024-08-09 14:24   ` flow gg
  0 siblings, 1 reply; 7+ messages in thread
From: uk7b @ 2024-08-09 14:24 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_bilin_4hv_8bpp_c                           :   10.7    9.5
vp9_avg_bilin_4hv_8bpp_rvv_i32                     :    4.0    3.5
vp9_avg_bilin_8hv_8bpp_c                           :   38.5   34.2
vp9_avg_bilin_8hv_8bpp_rvv_i32                     :    7.2    6.5
vp9_avg_bilin_16hv_8bpp_c                          :  147.2  130.5
vp9_avg_bilin_16hv_8bpp_rvv_i32                    :   14.5   12.7
vp9_avg_bilin_32hv_8bpp_c                          :  574.2  509.7
vp9_avg_bilin_32hv_8bpp_rvv_i32                    :   42.5   38.0
vp9_avg_bilin_64hv_8bpp_c                          : 2321.2 2017.7
vp9_avg_bilin_64hv_8bpp_rvv_i32                    :  163.5  131.0
vp9_put_bilin_4hv_8bpp_c                           :   10.0    8.7
vp9_put_bilin_4hv_8bpp_rvv_i32                     :    3.5    3.0
vp9_put_bilin_8hv_8bpp_c                           :   35.2   31.2
vp9_put_bilin_8hv_8bpp_rvv_i32                     :    6.5    5.7
vp9_put_bilin_16hv_8bpp_c                          :  134.0  119.0
vp9_put_bilin_16hv_8bpp_rvv_i32                    :   12.7   11.5
vp9_put_bilin_32hv_8bpp_c                          :  538.5  464.2
vp9_put_bilin_32hv_8bpp_rvv_i32                    :   39.7   35.2
vp9_put_bilin_64hv_8bpp_c                          : 2111.7 1833.2
vp9_put_bilin_64hv_8bpp_rvv_i32                    :  138.5  122.5
---
 libavcodec/riscv/vp9_mc_rvv.S  | 46 +++++++++++++++++++++++++++++++++-
 libavcodec/riscv/vp9dsp_init.c | 10 ++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 9e8061616f..d1ddbe007b 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -138,6 +138,48 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
 endfunc
 .endm
 
+.macro bilin_hv op
+func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
+        lpad    0
+        vsetvlstatic8   64, t0, 64
+.Lbilin_hv\op:
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+        neg             t1, a5
+        neg             t2, a6
+        li              t3, 8
+        bilin_load      v24, a5, h
+1:
+        addi            a4, a4, -2
+        bilin_load      v8, a5, h
+        vwmulu.vx       v16, v8, a6
+        vwmaccsu.vx     v16, t2, v24
+        vwadd.wx        v16, v16, t3
+        vnsra.wi        v16, v16, 4
+        vadd.vv         v12, v16, v24
+        add             t5, a0, a1
+        bilin_load      v24, a5, h
+        vwmulu.vx       v16, v24, a6
+        vwmaccsu.vx     v16, t2, v8
+        vwadd.wx        v16, v16, t3
+        vnsra.wi        v16, v16, 4
+        vadd.vv         v0, v16, v8
+.ifc \op,avg
+        vle8.v          v8, (a0)
+        vle8.v          v16, (t5)
+        vaaddu.vv       v12, v12, v8
+        vaaddu.vv       v0, v0, v16
+.endif
+        vse8.v          v12, (a0)
+        vse8.v          v0, (t5)
+        add             a0, t5, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
 .endr
@@ -146,6 +188,8 @@ bilin_h_v  put, h, a5
 bilin_h_v  avg, h, a5
 bilin_h_v  put, v, a6
 bilin_h_v  avg, v, a6
+bilin_hv   put
+bilin_hv   avg
 
 .macro func_bilin_h_v len, op, type
 func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
@@ -157,7 +201,7 @@ endfunc
 
 .irp len, 32, 16, 8, 4
         .irp op, put, avg
-                .irp type, h, v
+                .irp type, h, v, hv
                         func_bilin_h_v \len, \op, \type
                 .endr
         .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 83dbe1b5d9..d53852f673 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
     dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
     dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
 
 #undef init_fpel
     }
-- 
2.46.0


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv
       [not found] <20240801120851.3097443-1-uk7b@foxmail.com>
@ 2024-08-01 12:08 ` uk7b
  0 siblings, 0 replies; 7+ messages in thread
From: uk7b @ 2024-08-01 12:08 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_bilin_4hv_8bpp_c                           :   10.7    9.5
vp9_avg_bilin_4hv_8bpp_rvv_i32                     :    4.0    3.5
vp9_avg_bilin_8hv_8bpp_c                           :   38.5   34.2
vp9_avg_bilin_8hv_8bpp_rvv_i32                     :    7.2    6.5
vp9_avg_bilin_16hv_8bpp_c                          :  147.2  130.5
vp9_avg_bilin_16hv_8bpp_rvv_i32                    :   14.5   12.7
vp9_avg_bilin_32hv_8bpp_c                          :  574.2  509.7
vp9_avg_bilin_32hv_8bpp_rvv_i32                    :   42.5   38.0
vp9_avg_bilin_64hv_8bpp_c                          : 2321.2 2017.7
vp9_avg_bilin_64hv_8bpp_rvv_i32                    :  163.5  131.0
vp9_put_bilin_4hv_8bpp_c                           :   10.0    8.7
vp9_put_bilin_4hv_8bpp_rvv_i32                     :    3.5    3.0
vp9_put_bilin_8hv_8bpp_c                           :   35.2   31.2
vp9_put_bilin_8hv_8bpp_rvv_i32                     :    6.5    5.7
vp9_put_bilin_16hv_8bpp_c                          :  134.0  119.0
vp9_put_bilin_16hv_8bpp_rvv_i32                    :   12.7   11.5
vp9_put_bilin_32hv_8bpp_c                          :  538.5  464.2
vp9_put_bilin_32hv_8bpp_rvv_i32                    :   39.7   35.2
vp9_put_bilin_64hv_8bpp_c                          : 2111.7 1833.2
vp9_put_bilin_64hv_8bpp_rvv_i32                    :  138.5  122.5
---
 libavcodec/riscv/vp9_mc_rvv.S  | 38 +++++++++++++++++++++++++++++++++-
 libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 2fe8e5b6d3..4faf932100 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -148,6 +148,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
 endfunc
 .endm
 
+.macro bilin_hv op
+func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
+        vsetvlstatic8   64, t0, 64
+.Lbilin_hv\op:
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+        neg             t1, a5
+        neg             t2, a6
+        li              t4, 8
+        bilin_load_h    v24, put, a5
+        add             a2, a2, a3
+1:
+        addi            a4, a4, -1
+        bilin_load_h    v4, put, a5
+        vwmulu.vx       v16, v4, a6
+        vwmaccsu.vx     v16, t2, v24
+        vwadd.wx        v16, v16, t4
+        vnsra.wi        v16, v16, 4
+        vadd.vv         v0, v16, v24
+.ifc \op,avg
+        vle8.v          v16, (a0)
+        vaaddu.vv       v0, v0, v16
+.endif
+        vse8.v          v0, (a0)
+        vmv.v.v         v24, v4
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
 .endr
@@ -156,6 +190,8 @@ bilin_h_v  put, h, a5
 bilin_h_v  avg, h, a5
 bilin_h_v  put, v, a6
 bilin_h_v  avg, v, a6
+bilin_hv   put
+bilin_hv   avg
 
 .macro func_bilin_h_v len, op, type
 func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
@@ -166,7 +202,7 @@ endfunc
 
 .irp len, 32, 16, 8, 4
         .irp op, put, avg
-                .irp type, h, v
+                .irp type, h, v, hv
                         func_bilin_h_v \len, \op, \type
                 .endr
         .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 83dbe1b5d9..d53852f673 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
     dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
     dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
 
 #undef init_fpel
     }
-- 
2.46.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-08-09 14:25 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20240803095040.3648582-1-uk7b@foxmail.com>
2024-08-03  9:50 ` [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-08-07 17:23   ` Rémi Denis-Courmont
2024-08-03  9:50 ` [FFmpeg-devel] [PATCH 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
2024-08-03  9:50 ` [FFmpeg-devel] [PATCH 4/4] lavc/vp9dsp: R-V V mc tap hv uk7b
     [not found] <20240809142406.3845162-1-uk7b@foxmail.com>
2024-08-09 14:24 ` [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-08-09 14:24   ` flow gg
     [not found] <20240801120851.3097443-1-uk7b@foxmail.com>
2024-08-01 12:08 ` uk7b

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git