Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH v3 2/5] lavc/vp9dsp: R-V V mc bilin h v
       [not found] <20240529171540.911099-1-uk7b@foxmail.com>
@ 2024-05-29 17:15 ` uk7b
  2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv uk7b
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: uk7b @ 2024-05-29 17:15 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp9_avg_bilin_4h_8bpp_c: 5.2
vp9_avg_bilin_4h_8bpp_rvv_i64: 2.2
vp9_avg_bilin_4v_8bpp_c: 5.5
vp9_avg_bilin_4v_8bpp_rvv_i64: 2.2
vp9_avg_bilin_8h_8bpp_c: 20.0
vp9_avg_bilin_8h_8bpp_rvv_i64: 4.5
vp9_avg_bilin_8v_8bpp_c: 21.0
vp9_avg_bilin_8v_8bpp_rvv_i64: 4.2
vp9_avg_bilin_16h_8bpp_c: 78.2
vp9_avg_bilin_16h_8bpp_rvv_i64: 9.0
vp9_avg_bilin_16v_8bpp_c: 82.0
vp9_avg_bilin_16v_8bpp_rvv_i64: 9.0
vp9_avg_bilin_32h_8bpp_c: 325.5
vp9_avg_bilin_32h_8bpp_rvv_i64: 26.2
vp9_avg_bilin_32v_8bpp_c: 326.2
vp9_avg_bilin_32v_8bpp_rvv_i64: 26.2
vp9_avg_bilin_64h_8bpp_c: 1265.7
vp9_avg_bilin_64h_8bpp_rvv_i64: 91.5
vp9_avg_bilin_64v_8bpp_c: 1317.0
vp9_avg_bilin_64v_8bpp_rvv_i64: 91.2
vp9_put_bilin_4h_8bpp_c: 4.5
vp9_put_bilin_4h_8bpp_rvv_i64: 1.7
vp9_put_bilin_4v_8bpp_c: 4.7
vp9_put_bilin_4v_8bpp_rvv_i64: 1.7
vp9_put_bilin_8h_8bpp_c: 17.0
vp9_put_bilin_8h_8bpp_rvv_i64: 3.5
vp9_put_bilin_8v_8bpp_c: 18.0
vp9_put_bilin_8v_8bpp_rvv_i64: 3.5
vp9_put_bilin_16h_8bpp_c: 65.2
vp9_put_bilin_16h_8bpp_rvv_i64: 7.5
vp9_put_bilin_16v_8bpp_c: 85.7
vp9_put_bilin_16v_8bpp_rvv_i64: 7.5
vp9_put_bilin_32h_8bpp_c: 257.5
vp9_put_bilin_32h_8bpp_rvv_i64: 23.5
vp9_put_bilin_32v_8bpp_c: 274.5
vp9_put_bilin_32v_8bpp_rvv_i64: 23.5
vp9_put_bilin_64h_8bpp_c: 1040.5
vp9_put_bilin_64h_8bpp_rvv_i64: 82.5
vp9_put_bilin_64v_8bpp_c: 1108.7
vp9_put_bilin_64v_8bpp_rvv_i64: 82.2
---
 libavcodec/riscv/vp9_mc_rvv.S  | 60 ++++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp.h      | 12 +++----
 libavcodec/riscv/vp9dsp_init.c | 21 ++++++++++++
 3 files changed, 87 insertions(+), 6 deletions(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 7cb38ec94a..9611aba0ed 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -53,6 +53,66 @@ func ff_vp9_avg\len\()_rvv, zve32x
 endfunc
 .endm
 
+.macro bilin_load dst, op, type, mn
+.ifc \type,v
+        add             t5, a2, a3
+.else
+        addi            t5, a2, 1
+.endif
+        vle8.v          v8, (a2)
+        vle8.v          v0, (t5)
+        vwmulu.vx       v16, v0, \mn
+        vwmaccsu.vx     v16, t1, v8
+        vwadd.wx        v16, v16, t4
+        vnsra.wi        v16, v16, 4
+        vadd.vv         \dst, v16, v8
+.ifc \op,avg
+        vle8.v          v16, (a0)
+        vaaddu.vv       \dst, \dst, v16
+.endif
+.endm
+
+.macro bilin_h_v op, type, mn
+func ff_\op\()_vp9_bilin_4\type\()_rvv, zve32x
+        vsetvlstatic8   4, t0, 64
+.Lbilin_\type\op:
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+        li              t4, 8
+        neg             t1, \mn
+1:
+        addi            a4, a4, -1
+        bilin_load      v0, \op, \type, \mn
+        vse8.v          v0, (a0)
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
 .endr
+
+bilin_h_v  put, h, a5
+bilin_h_v  avg, h, a5
+bilin_h_v  put, v, a6
+bilin_h_v  avg, v, a6
+
+.macro func_bilin_h_v len, op, type
+func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
+        vsetvlstatic8   \len, t0, 64
+        j               .Lbilin_\type\()\op
+endfunc
+.endm
+
+.irp len, 64, 32, 16, 8
+        .irp op, put, avg
+                .irp type, h, v
+                        func_bilin_h_v \len, \op, \type
+                .endr
+        .endr
+.endr
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index ff8431591c..8fb326dae0 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -113,27 +113,27 @@ void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
                                          int h, int mx, int my);
 
 #define VP9_BILINEAR_RISCV_RVV_FUNC(SIZE)                                   \
-void ff_put_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,         \
+void ff_put_vp9_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,     \
                                 const uint8_t *src, ptrdiff_t srcstride,   \
                                 int h, int mx, int my);                    \
                                                                            \
-void ff_put_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,         \
+void ff_put_vp9_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,     \
                                 const uint8_t *src, ptrdiff_t srcstride,   \
                                 int h, int mx, int my);                    \
                                                                            \
-void ff_put_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,        \
+void ff_put_vp9_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,    \
                                  const uint8_t *src, ptrdiff_t srcstride,  \
                                  int h, int mx, int my);                   \
                                                                            \
-void ff_avg_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,         \
+void ff_avg_vp9_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,     \
                                 const uint8_t *src, ptrdiff_t srcstride,   \
                                 int h, int mx, int my);                    \
                                                                            \
-void ff_avg_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,         \
+void ff_avg_vp9_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,     \
                                 const uint8_t *src, ptrdiff_t srcstride,   \
                                 int h, int mx, int my);                    \
                                                                            \
-void ff_avg_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,        \
+void ff_avg_vp9_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,    \
                                  const uint8_t *src, ptrdiff_t srcstride,  \
                                  int h, int mx, int my);
 
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 454dcd963f..9606d8545f 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -63,6 +63,27 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     init_fpel(3, 8);
     init_fpel(4, 4);
 
+    dsp->mc[0][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_64v_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_64h_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_64v_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_64h_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_32v_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_32h_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_32v_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_32h_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_16v_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_16h_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_16v_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_16h_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_8v_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_8h_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_8v_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_8h_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_4v_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
+
 #undef init_fpel
     }
 #endif
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv
       [not found] <20240529171540.911099-1-uk7b@foxmail.com>
  2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 2/5] lavc/vp9dsp: R-V V mc bilin h v uk7b
@ 2024-05-29 17:15 ` uk7b
  2024-05-29 20:07   ` Rémi Denis-Courmont
  2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v uk7b
  2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 5/5] lavc/vp9dsp: R-V V mc tap hv uk7b
  3 siblings, 1 reply; 7+ messages in thread
From: uk7b @ 2024-05-29 17:15 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp9_avg_bilin_4hv_8bpp_c: 11.0
vp9_avg_bilin_4hv_8bpp_rvv_i64: 3.7
vp9_avg_bilin_8hv_8bpp_c: 38.7
vp9_avg_bilin_8hv_8bpp_rvv_i64: 7.2
vp9_avg_bilin_16hv_8bpp_c: 147.0
vp9_avg_bilin_16hv_8bpp_rvv_i64: 14.2
vp9_avg_bilin_32hv_8bpp_c: 574.5
vp9_avg_bilin_32hv_8bpp_rvv_i64: 42.7
vp9_avg_bilin_64hv_8bpp_c: 2311.5
vp9_avg_bilin_64hv_8bpp_rvv_i64: 201.7
vp9_put_bilin_4hv_8bpp_c: 10.0
vp9_put_bilin_4hv_8bpp_rvv_i64: 3.2
vp9_put_bilin_8hv_8bpp_c: 35.2
vp9_put_bilin_8hv_8bpp_rvv_i64: 6.5
vp9_put_bilin_16hv_8bpp_c: 133.7
vp9_put_bilin_16hv_8bpp_rvv_i64: 13.0
vp9_put_bilin_32hv_8bpp_c: 538.2
vp9_put_bilin_32hv_8bpp_rvv_i64: 39.7
vp9_put_bilin_64hv_8bpp_c: 2114.0
vp9_put_bilin_64hv_8bpp_rvv_i64: 153.7
---
 libavcodec/riscv/vp9_mc_rvv.S  | 38 +++++++++++++++++++++++++++++++++-
 libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 9611aba0ed..990271736b 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -93,6 +93,40 @@ func ff_\op\()_vp9_bilin_4\type\()_rvv, zve32x
 endfunc
 .endm
 
+.macro bilin_hv op
+func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
+        vsetvlstatic8   4, t0, 64
+.Lbilin_hv\op:
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+        neg             t1, a5
+        neg             t2, a6
+        li              t4, 8
+        bilin_load      v24, put, h, a5
+        add             a2, a2, a3
+1:
+        addi            a4, a4, -1
+        bilin_load      v4, put, h, a5
+        vwmulu.vx       v16, v4, a6
+        vwmaccsu.vx     v16, t2, v24
+        vwadd.wx        v16, v16, t4
+        vnsra.wi        v16, v16, 4
+        vadd.vv         v0, v16, v24
+.ifc \op,avg
+        vle8.v          v16, (a0)
+        vaaddu.vv       v0, v0, v16
+.endif
+        vse8.v          v0, (a0)
+        vmv.v.v         v24, v4
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
 .endr
@@ -101,6 +135,8 @@ bilin_h_v  put, h, a5
 bilin_h_v  avg, h, a5
 bilin_h_v  put, v, a6
 bilin_h_v  avg, v, a6
+bilin_hv   put
+bilin_hv   avg
 
 .macro func_bilin_h_v len, op, type
 func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
@@ -111,7 +147,7 @@ endfunc
 
 .irp len, 64, 32, 16, 8
         .irp op, put, avg
-                .irp type, h, v
+                .irp type, h, v, hv
                         func_bilin_h_v \len, \op, \type
                 .endr
         .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 9606d8545f..b3700dfb08 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
     dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
     dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
 
 #undef init_fpel
     }
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v
       [not found] <20240529171540.911099-1-uk7b@foxmail.com>
  2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 2/5] lavc/vp9dsp: R-V V mc bilin h v uk7b
  2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-05-29 17:15 ` uk7b
  2024-05-29 17:19   ` flow gg
  2024-05-29 20:29   ` Rémi Denis-Courmont
  2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 5/5] lavc/vp9dsp: R-V V mc tap hv uk7b
  3 siblings, 2 replies; 7+ messages in thread
From: uk7b @ 2024-05-29 17:15 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_8tap_smooth_4h_8bpp_c                      :   13.0   11.2
vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    5.0    4.2
vp9_avg_8tap_smooth_4v_8bpp_c                      :   13.7   12.5
vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    5.0    4.2
vp9_avg_8tap_smooth_8h_8bpp_c                      :   49.5   42.2
vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.2    8.5
vp9_avg_8tap_smooth_8v_8bpp_c                      :   66.5   45.0
vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.7  166.5
vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.2   18.7
vp9_avg_8tap_smooth_16v_8bpp_c                     :  192.2  175.7
vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.5   19.0
vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.7
vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   83.5   60.0
vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.5  689.2
vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.2   60.0
vp9_avg_8tap_smooth_64h_8bpp_c                     : 3115.5 2647.2
vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  283.5  119.2
vp9_avg_8tap_smooth_64v_8bpp_c                     : 3082.2 2729.0
vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  305.2  119.0
vp9_put_8tap_smooth_4h_8bpp_c                      :   11.2    9.7
vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    4.0
vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.7
vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.2    4.0
vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
vp9_put_8tap_smooth_8v_8bpp_c                      :   44.2   38.7
vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.5    7.7
vp9_put_8tap_smooth_16h_8bpp_c                     :  165.7  147.2
vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   19.5   17.5
vp9_put_8tap_smooth_16v_8bpp_c                     :  169.0  149.7
vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
vp9_put_8tap_smooth_32h_8bpp_c                     :  659.7  586.7
vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   64.2   57.2
vp9_put_8tap_smooth_32v_8bpp_c                     :  680.5  591.2
vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.2   57.2
vp9_put_8tap_smooth_64h_8bpp_c                     : 2681.5 2339.0
vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  255.5  114.2
vp9_put_8tap_smooth_64v_8bpp_c                     : 2709.7 2348.7
vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
---
 libavcodec/riscv/vp9_mc_rvv.S  | 204 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
 libavcodec/riscv/vp9dsp_init.c |  37 +++++-
 3 files changed, 288 insertions(+), 25 deletions(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 990271736b..53dd833dac 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -36,6 +36,18 @@
 .endif
 .endm
 
+.macro vsetvlstatic16 len
+.ifc \len,4
+        vsetvli         zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+        vsetvli         zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+        vsetvli         zero, zero, e16, m2, ta, ma
+.else
+        vsetvli         zero, zero, e16, m4, ta, ma
+.endif
+.endm
+
 .macro copy_avg len
 func ff_vp9_avg\len\()_rvv, zve32x
         csrwi           vxrm, 0
@@ -127,8 +139,200 @@ func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
 endfunc
 .endm
 
+.macro epel_filter name, type, regtype
+        lla             \regtype\()2, ff_vp9_subpel_filters
+
+.ifc \name,regular
+        addi            \regtype\()2, \regtype\()2, 16*8*2
+.endif
+.ifc \name,sharp
+        addi            \regtype\()2, \regtype\()2, 16*8*2*2
+.endif
+
+.ifc \type,v
+        slli            \regtype\()0, a6, 4
+.else
+        slli            \regtype\()0, a5, 4
+.endif
+        add             \regtype\()0, \regtype\()0, \regtype\()2
+
+        lh              \regtype\()1, 2(\regtype\()0)
+        lh              \regtype\()2, 4(\regtype\()0)
+        lh              \regtype\()3, 6(\regtype\()0)
+        lh              \regtype\()4, 8(\regtype\()0)
+        lh              \regtype\()5, 10(\regtype\()0)
+        lh              \regtype\()6, 12(\regtype\()0)
+
+.ifc \regtype,t
+        lh              a7, 14(\regtype\()0)
+.else
+        lh              s7, 14(\regtype\()0)
+.endif
+        lh              \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst, len, op, name, type, from_mem, regtype
+        li              a5, 64
+.ifc \from_mem, 1
+        vle8.v          v22, (a2)
+.ifc \type,v
+        sub             a2, a2, a3
+        vle8.v          v20, (a2)
+        sh1add          a2, a3, a2
+        vle8.v          v24, (a2)
+        add             a2, a2, a3
+        vle8.v          v26, (a2)
+        add             a2, a2, a3
+        vle8.v          v28, (a2)
+        add             a2, a2, a3
+        vle8.v          v30, (a2)
+.else
+        addi            a2, a2, -1
+        vle8.v          v20, (a2)
+        addi            a2, a2, 2
+        vle8.v          v24, (a2)
+        addi            a2, a2, 1
+        vle8.v          v26, (a2)
+        addi            a2, a2, 1
+        vle8.v          v28, (a2)
+        addi            a2, a2, 1
+        vle8.v          v30, (a2)
+.endif
+
+.ifc \name,smooth
+        vwmulu.vx       v16, v24, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v20
+        vwmaccu.vx      v16, \regtype\()5, v26
+        vwmaccsu.vx     v16, \regtype\()6, v28
+.else
+        vwmulu.vx       v16, v28, \regtype\()6
+        vwmaccsu.vx     v16, \regtype\()2, v20
+        vwmaccsu.vx     v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v30
+.else
+        vwmaccsu.vx     v16, s7, v30
+.endif
+
+.ifc \type,v
+        .rept 6
+        sub             a2, a2, a3
+        .endr
+        vle8.v          v28, (a2)
+        sub             a2, a2, a3
+        vle8.v          v26, (a2)
+        sh1add          a2, a3, a2
+        add             a2, a2, a3
+.else
+        addi            a2, a2, -6
+        vle8.v          v28, (a2)
+        addi            a2, a2, -1
+        vle8.v          v26, (a2)
+        addi            a2, a2, 3
+.endif
+
+.ifc \name,smooth
+        vwmaccsu.vx     v16, \regtype\()1, v28
+.else
+        vwmaccu.vx      v16, \regtype\()1, v28
+        vwmulu.vx       v28, v24, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v26
+        vwmulu.vx       v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+        vwmulu.vx       v16, v8, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v4
+        vwmaccu.vx      v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()1, v2
+.else
+        vwmulu.vx       v16, v2, \regtype\()1
+        vwmaccu.vx      v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()2, v4
+        vwmulu.vx       v28, v8, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v0
+        vwmulu.vx       v20, v6, \regtype\()3
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v14
+.else
+        vwmaccsu.vx     v16, s7, v14
+.endif
+
+.endif
+        vwadd.wx        v16, v16, a5
+        vsetvlstatic16  \len
+
+.ifc \name,smooth
+        vwadd.vv        v24, v16, v20
+.else
+        vwadd.vv        v24, v16, v28
+        vwadd.wv        v24, v24, v20
+.endif
+        vnsra.wi        v24, v24, 7
+        vmax.vx         v24, v24, zero
+        vsetvlstatic8   \len, zero, 32, m2
+
+        vnclipu.wi      \dst, v24, 0
+.ifc \op,avg
+        vle8.v          v24, (a0)
+        vaaddu.vv       \dst, \dst, v24
+.endif
+
+.endm
+
+.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
+        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
+        add             a2, a2, a3
+.endm
+
+.macro epel len, op, name, type, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
+        epel_filter     \name, \type, t
+.if \vlen < 256
+        vsetvlstatic8   \len, a5, 32, m2
+.else
+        vsetvlstatic8   \len, a5, 64, m2
+.endif
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+
+1:
+        addi            a4, a4, -1
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+.if \len == 64 && \vlen < 256
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+        addi            a0, a0, -32
+        addi            a2, a2, -32
+.endif
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
+        .irp op, put, avg
+                .irp name, regular, sharp, smooth
+                        .irp type, h, v
+                                epel \len, \op, \name, \type, 128
+                                epel \len, \op, \name, \type, 256
+                        .endr
+                .endr
+        .endr
 .endr
 
 bilin_h_v  put, h, a5
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 8fb326dae0..5fd64a1b8c 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
 void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
                    const uint8_t *a);
 
-#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                         \
-void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)              \
+void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);             \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);
@@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,     \
                         const uint8_t *src, ptrdiff_t srcstride,   \
                         int h, int mx, int my);
 
-VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
 
 VP9_BILINEAR_RISCV_RVV_FUNC(64);
 VP9_BILINEAR_RISCV_RVV_FUNC(32);
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index b3700dfb08..5f759e6bc8 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
 # endif
 
 #if HAVE_RVV
-    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
+    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
+    if (ff_rv_vlen_least(128)) {
 
 #define init_fpel(idx1, sz)                                           \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
@@ -95,6 +96,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
 
 #undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
+
+#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
+
+    init_subpel2(0, 1, 0, h, put, 128);
+    init_subpel2(1, 1, 0, h, avg, 128);
+
+    if (flags & AV_CPU_FLAG_RVB_ADDR) {
+        init_subpel2(0, 0, 1, v, put, 128);
+        init_subpel2(1, 0, 1, v, avg, 128);
+    }
+
+    }
+    if (ff_rv_vlen_least(256)) {
+        init_subpel2(0, 1, 0, h, put, 256);
+        init_subpel2(1, 1, 0, h, avg, 256);
+
+        if (flags & AV_CPU_FLAG_RVB_ADDR) {
+            init_subpel2(0, 0, 1, v, put, 256);
+            init_subpel2(1, 0, 1, v, avg, 256);
+        }
+    }
     }
 #endif
 #endif
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH v3 5/5] lavc/vp9dsp: R-V V mc tap hv
       [not found] <20240529171540.911099-1-uk7b@foxmail.com>
                   ` (2 preceding siblings ...)
  2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v uk7b
@ 2024-05-29 17:15 ` uk7b
  3 siblings, 0 replies; 7+ messages in thread
From: uk7b @ 2024-05-29 17:15 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_8tap_smooth_4hv_8bpp_c                     :   32.0   28.2
vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32               :   15.0   13.2
vp9_avg_8tap_smooth_8hv_8bpp_c                     :   98.0   86.2
vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32               :   23.7   21.0
vp9_avg_8tap_smooth_16hv_8bpp_c                    :  355.5  297.0
vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32              :   62.7   41.2
vp9_avg_8tap_smooth_32hv_8bpp_c                    : 1273.0 1099.7
vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32              :  133.7  119.2
vp9_avg_8tap_smooth_64hv_8bpp_c                    : 4933.0 4240.5
vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32              :  506.7  227.0
vp9_put_8tap_smooth_4hv_8bpp_c                     :   30.2   27.0
vp9_put_8tap_smooth_4hv_8bpp_rvv_i32               :   14.5   12.7
vp9_put_8tap_smooth_8hv_8bpp_c                     :   91.2   81.2
vp9_put_8tap_smooth_8hv_8bpp_rvv_i32               :   22.7   20.2
vp9_put_8tap_smooth_16hv_8bpp_c                    :  329.2  277.7
vp9_put_8tap_smooth_16hv_8bpp_rvv_i32              :   44.7   40.0
vp9_put_8tap_smooth_32hv_8bpp_c                    : 1183.7 1022.7
vp9_put_8tap_smooth_32hv_8bpp_rvv_i32              :  130.7  116.5
vp9_put_8tap_smooth_64hv_8bpp_c                    : 4502.7 3954.5
vp9_put_8tap_smooth_64hv_8bpp_rvv_i32              :  496.0  224.7
---
 libavcodec/riscv/vp9_mc_rvv.S  | 75 ++++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp_init.c |  8 ++++
 2 files changed, 83 insertions(+)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 53dd833dac..fed698c802 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -323,6 +323,77 @@ func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
 endfunc
 .endm
 
+#if __riscv_xlen == 64
+.macro epel_hv_once len, name, op
+        sub             a2, a2, a3
+        sub             a2, a2, a3
+        sub             a2, a2, a3
+        .irp n,0,2,4,6,8,10,12,14
+        epel_load_inc   v\n, \len, put, \name, h, 1, t
+        .endr
+        addi            a4, a4, -1
+1:
+        addi            a4, a4, -1
+        epel_load       v30, \len, \op, \name, v, 0, s
+        vse8.v          v30, (a0)
+        vmv.v.v         v0, v2
+        vmv.v.v         v2, v4
+        vmv.v.v         v4, v6
+        vmv.v.v         v6, v8
+        vmv.v.v         v8, v10
+        vmv.v.v         v10, v12
+        vmv.v.v         v12, v14
+        epel_load       v14, \len, put, \name, h, 1, t
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+        epel_load       v30, \len, \op, \name, v, 0, s
+        vse8.v          v30, (a0)
+.endm
+
+.macro epel_hv op, name, len, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x
+        addi            sp, sp, -64
+        .irp n,0,1,2,3,4,5,6,7
+        sd              s\n, \n\()<<3(sp)
+        .endr
+.if \len == 64 && \vlen < 256
+        addi            sp, sp, -48
+        .irp n,0,1,2,3,4,5
+        sd              a\n, \n\()<<3(sp)
+        .endr
+.endif
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+        epel_filter     \name, h, t
+        epel_filter     \name, v, s
+.if \vlen < 256
+        vsetvlstatic8   \len, a6, 32, m2
+.else
+        vsetvlstatic8   \len, a6, 64, m2
+.endif
+        epel_hv_once    \len, \name, \op
+.if \len == 64 && \vlen < 256
+        .irp n,0,1,2,3,4,5
+        ld              a\n, \n\()<<3(sp)
+        .endr
+        addi            sp, sp, 48
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_filter     \name, h, t
+        epel_hv_once    \len, \name, \op
+.endif
+        .irp n,0,1,2,3,4,5,6,7
+        ld              s\n, \n\()<<3(sp)
+        .endr
+        addi            sp, sp, 64
+
+        ret
+endfunc
+.endm
+#endif
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
         .irp op, put, avg
@@ -331,6 +402,10 @@ endfunc
                                 epel \len, \op, \name, \type, 128
                                 epel \len, \op, \name, \type, 256
                         .endr
+                        #if __riscv_xlen == 64
+                        epel_hv \op, \name, \len, 128
+                        epel_hv \op, \name, \len, 256
+                        #endif
                 .endr
         .endr
 .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 5f759e6bc8..6b39ad8ee0 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -118,6 +118,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     if (flags & AV_CPU_FLAG_RVB_ADDR) {
         init_subpel2(0, 0, 1, v, put, 128);
         init_subpel2(1, 0, 1, v, avg, 128);
+# if __riscv_xlen == 64
+        init_subpel2(0, 1, 1, hv, put, 128);
+        init_subpel2(1, 1, 1, hv, avg, 128);
+# endif
     }
 
     }
@@ -128,6 +132,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
         if (flags & AV_CPU_FLAG_RVB_ADDR) {
             init_subpel2(0, 0, 1, v, put, 256);
             init_subpel2(1, 0, 1, v, avg, 256);
+# if __riscv_xlen == 64
+            init_subpel2(0, 1, 1, hv, put, 256);
+            init_subpel2(1, 1, 1, hv, avg, 256);
+# endif
         }
     }
     }
-- 
2.45.1


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v
  2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v uk7b
@ 2024-05-29 17:19   ` flow gg
  2024-05-29 20:29   ` Rémi Denis-Courmont
  1 sibling, 0 replies; 7+ messages in thread
From: flow gg @ 2024-05-29 17:19 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

A portion has been modified according to the previous review, but there are
still some parts that haven't been updated

> Similarly, it
> should be possible to share most of the horizontal and vertical code
(maybe
> also for bilinear. not just EPel) with separate load/store then inner
> procedures. The H.263 loop filter already does that though with almost no
> overhead, though
> H.263 is obviously simpler than VP9.
>
> A French philosopher famously said that Perfect is the ennemy of Good.
> Generally, as with VVC, nested repetition macros for finely specialised
> functions tend to generate way too much byte code, and this ends up being
> worse rather than better in the big picture.

Here, bilin is modified with reference to your vp8 modification method, but
there are some issues with epel. I want to share most of the horizontal and
vertical code like h263, but because there are different types
(op/name/len), such changes seem hard. Trying to make similar modifications
for bilin also seems some hard , maybe leaving it for future optimization
:'(

> It should be possible to spare one ADDI by using just AUIPC here, and
folding
> the immediate offset into the LB's below (see also H.263 loop filter).

I'm not sure where the problem lies, but for smooth it works, but for
sharp, regular, it gives this error:
dangerous relocation: %pcrel_lo overflow with an addend, the value of
%pcrel_hi is 0xa5000 without any addend, but may be 0xa6000 after adding
the %pcrel_lo addend

<uk7b@foxmail.com> 于2024年5月30日周四 01:16写道:

> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
>                                                      C908   X60
> vp9_avg_8tap_smooth_4h_8bpp_c                      :   13.0   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    5.0    4.2
> vp9_avg_8tap_smooth_4v_8bpp_c                      :   13.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    5.0    4.2
> vp9_avg_8tap_smooth_8h_8bpp_c                      :   49.5   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.2    8.5
> vp9_avg_8tap_smooth_8v_8bpp_c                      :   66.5   45.0
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.7  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.2   18.7
> vp9_avg_8tap_smooth_16v_8bpp_c                     :  192.2  175.7
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.5   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.7
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   83.5   60.0
> vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.5  689.2
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.2   60.0
> vp9_avg_8tap_smooth_64h_8bpp_c                     : 3115.5 2647.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  283.5  119.2
> vp9_avg_8tap_smooth_64v_8bpp_c                     : 3082.2 2729.0
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  305.2  119.0
> vp9_put_8tap_smooth_4h_8bpp_c                      :   11.2    9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    4.0
> vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.7
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.2    4.0
> vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_8v_8bpp_c                      :   44.2   38.7
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_16h_8bpp_c                     :  165.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   19.5   17.5
> vp9_put_8tap_smooth_16v_8bpp_c                     :  169.0  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c                     :  659.7  586.7
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   64.2   57.2
> vp9_put_8tap_smooth_32v_8bpp_c                     :  680.5  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.2   57.2
> vp9_put_8tap_smooth_64h_8bpp_c                     : 2681.5 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  255.5  114.2
> vp9_put_8tap_smooth_64v_8bpp_c                     : 2709.7 2348.7
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 204 +++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
>  libavcodec/riscv/vp9dsp_init.c |  37 +++++-
>  3 files changed, 288 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 990271736b..53dd833dac 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +        vsetvli         zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +        vsetvli         zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +        vsetvli         zero, zero, e16, m2, ta, ma
> +.else
> +        vsetvli         zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>          csrwi           vxrm, 0
> @@ -127,8 +139,200 @@ func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
>  endfunc
>  .endm
>
> +.macro epel_filter name, type, regtype
> +        lla             \regtype\()2, ff_vp9_subpel_filters
> +
> +.ifc \name,regular
> +        addi            \regtype\()2, \regtype\()2, 16*8*2
> +.endif
> +.ifc \name,sharp
> +        addi            \regtype\()2, \regtype\()2, 16*8*2*2
> +.endif
> +
> +.ifc \type,v
> +        slli            \regtype\()0, a6, 4
> +.else
> +        slli            \regtype\()0, a5, 4
> +.endif
> +        add             \regtype\()0, \regtype\()0, \regtype\()2
> +
> +        lh              \regtype\()1, 2(\regtype\()0)
> +        lh              \regtype\()2, 4(\regtype\()0)
> +        lh              \regtype\()3, 6(\regtype\()0)
> +        lh              \regtype\()4, 8(\regtype\()0)
> +        lh              \regtype\()5, 10(\regtype\()0)
> +        lh              \regtype\()6, 12(\regtype\()0)
> +
> +.ifc \regtype,t
> +        lh              a7, 14(\regtype\()0)
> +.else
> +        lh              s7, 14(\regtype\()0)
> +.endif
> +        lh              \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +        li              a5, 64
> +.ifc \from_mem, 1
> +        vle8.v          v22, (a2)
> +.ifc \type,v
> +        sub             a2, a2, a3
> +        vle8.v          v20, (a2)
> +        sh1add          a2, a3, a2
> +        vle8.v          v24, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v26, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v28, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v30, (a2)
> +.else
> +        addi            a2, a2, -1
> +        vle8.v          v20, (a2)
> +        addi            a2, a2, 2
> +        vle8.v          v24, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v26, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v24, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v20
> +        vwmaccu.vx      v16, \regtype\()5, v26
> +        vwmaccsu.vx     v16, \regtype\()6, v28
> +.else
> +        vwmulu.vx       v16, v28, \regtype\()6
> +        vwmaccsu.vx     v16, \regtype\()2, v20
> +        vwmaccsu.vx     v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v30
> +.else
> +        vwmaccsu.vx     v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> +        .rept 6
> +        sub             a2, a2, a3
> +        .endr
> +        vle8.v          v28, (a2)
> +        sub             a2, a2, a3
> +        vle8.v          v26, (a2)
> +        sh1add          a2, a3, a2
> +        add             a2, a2, a3
> +.else
> +        addi            a2, a2, -6
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, -1
> +        vle8.v          v26, (a2)
> +        addi            a2, a2, 3
> +.endif
> +
> +.ifc \name,smooth
> +        vwmaccsu.vx     v16, \regtype\()1, v28
> +.else
> +        vwmaccu.vx      v16, \regtype\()1, v28
> +        vwmulu.vx       v28, v24, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v26
> +        vwmulu.vx       v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v8, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v4
> +        vwmaccu.vx      v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()1, v2
> +.else
> +        vwmulu.vx       v16, v2, \regtype\()1
> +        vwmaccu.vx      v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()2, v4
> +        vwmulu.vx       v28, v8, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v0
> +        vwmulu.vx       v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v14
> +.else
> +        vwmaccsu.vx     v16, s7, v14
> +.endif
> +
> +.endif
> +        vwadd.wx        v16, v16, a5
> +        vsetvlstatic16  \len
> +
> +.ifc \name,smooth
> +        vwadd.vv        v24, v16, v20
> +.else
> +        vwadd.vv        v24, v16, v28
> +        vwadd.wv        v24, v24, v20
> +.endif
> +        vnsra.wi        v24, v24, 7
> +        vmax.vx         v24, v24, zero
> +        vsetvlstatic8   \len, zero, 32, m2
> +
> +        vnclipu.wi      \dst, v24, 0
> +.ifc \op,avg
> +        vle8.v          v24, (a0)
> +        vaaddu.vv       \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> +        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
> +        add             a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> +        epel_filter     \name, \type, t
> +.if \vlen < 256
> +        vsetvlstatic8   \len, a5, 32, m2
> +.else
> +        vsetvlstatic8   \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +.if \len == 64 && \vlen < 256
> +        addi            a0, a0, 32
> +        addi            a2, a2, 32
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +        addi            a0, a0, -32
> +        addi            a2, a2, -32
> +.endif
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
> +        .irp op, put, avg
> +                .irp name, regular, sharp, smooth
> +                        .irp type, h, v
> +                                epel \len, \op, \name, \type, 128
> +                                epel \len, \op, \name, \type, 256
> +                        .endr
> +                .endr
> +        .endr
>  .endr
>
>  bilin_h_v  put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride,
> const uint8_t *l,
>  void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>                     const uint8_t *a);
>
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>      \
> -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)
>     \
> +void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride,  \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>    \
> +                                         ptrdiff_t dststride,
>     \
>                                           const uint8_t *src,
>    \
>                                           ptrdiff_t srcstride,
>     \
>                                           int h, int mx, int my);
>    \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride,  \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>    \
> +                                         ptrdiff_t dststride,
>     \
>                                           const uint8_t *src,
>    \
>                                           ptrdiff_t srcstride,
>     \
>                                           int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> dststride,     \
>                          const uint8_t *src, ptrdiff_t srcstride,   \
>                          int h, int mx, int my);
>
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
>
>  VP9_BILINEAR_RISCV_RVV_FUNC(64);
>  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..5f759e6bc8 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>  # endif
>
>  #if HAVE_RVV
> -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128)) {
> +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +    if (ff_rv_vlen_least(128)) {
>
>  #define init_fpel(idx1, sz)                                           \
>      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
> @@ -95,6 +96,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>      dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
>
>  #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> +
> +    init_subpel2(0, 1, 0, h, put, 128);
> +    init_subpel2(1, 1, 0, h, avg, 128);
> +
> +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +        init_subpel2(0, 0, 1, v, put, 128);
> +        init_subpel2(1, 0, 1, v, avg, 128);
> +    }
> +
> +    }
> +    if (ff_rv_vlen_least(256)) {
> +        init_subpel2(0, 1, 0, h, put, 256);
> +        init_subpel2(1, 1, 0, h, avg, 256);
> +
> +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +            init_subpel2(0, 0, 1, v, put, 256);
> +            init_subpel2(1, 0, 1, v, avg, 256);
> +        }
> +    }
>      }
>  #endif
>  #endif
> --
> 2.45.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv
  2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-05-29 20:07   ` Rémi Denis-Courmont
  0 siblings, 0 replies; 7+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-29 20:07 UTC (permalink / raw)
  To: ffmpeg-devel

Le keskiviikkona 29. toukokuuta 2024, 20.15.38 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
> C908:
> vp9_avg_bilin_4hv_8bpp_c: 11.0
> vp9_avg_bilin_4hv_8bpp_rvv_i64: 3.7
> vp9_avg_bilin_8hv_8bpp_c: 38.7
> vp9_avg_bilin_8hv_8bpp_rvv_i64: 7.2
> vp9_avg_bilin_16hv_8bpp_c: 147.0
> vp9_avg_bilin_16hv_8bpp_rvv_i64: 14.2
> vp9_avg_bilin_32hv_8bpp_c: 574.5
> vp9_avg_bilin_32hv_8bpp_rvv_i64: 42.7
> vp9_avg_bilin_64hv_8bpp_c: 2311.5
> vp9_avg_bilin_64hv_8bpp_rvv_i64: 201.7
> vp9_put_bilin_4hv_8bpp_c: 10.0
> vp9_put_bilin_4hv_8bpp_rvv_i64: 3.2
> vp9_put_bilin_8hv_8bpp_c: 35.2
> vp9_put_bilin_8hv_8bpp_rvv_i64: 6.5
> vp9_put_bilin_16hv_8bpp_c: 133.7
> vp9_put_bilin_16hv_8bpp_rvv_i64: 13.0
> vp9_put_bilin_32hv_8bpp_c: 538.2
> vp9_put_bilin_32hv_8bpp_rvv_i64: 39.7
> vp9_put_bilin_64hv_8bpp_c: 2114.0
> vp9_put_bilin_64hv_8bpp_rvv_i64: 153.7
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 38 +++++++++++++++++++++++++++++++++-
>  libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
>  2 files changed, 47 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 9611aba0ed..990271736b 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -93,6 +93,40 @@ func ff_\op\()_vp9_bilin_4\type\()_rvv, zve32x
>  endfunc
>  .endm
> 
> +.macro bilin_hv op
> +func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
> +        vsetvlstatic8   4, t0, 64
> +.Lbilin_hv\op:
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +        neg             t1, a5
> +        neg             t2, a6
> +        li              t4, 8
> +        bilin_load      v24, put, h, a5
> +        add             a2, a2, a3
> +1:
> +        addi            a4, a4, -1
> +        bilin_load      v4, put, h, a5
> +        vwmulu.vx       v16, v4, a6
> +        vwmaccsu.vx     v16, t2, v24
> +        vwadd.wx        v16, v16, t4
> +        vnsra.wi        v16, v16, 4
> +        vadd.vv         v0, v16, v24
> +.ifc \op,avg
> +        vle8.v          v16, (a0)
> +        vaaddu.vv       v0, v0, v16
> +.endif
> +        vse8.v          v0, (a0)
> +        vmv.v.v         v24, v4

Copying vectors is rarely justified - mostly only before destructive 
instructions such as FMA. If a4 is even (?), this should be faster by 
unrolling to two rows per iteration.

> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
>  .endr
> @@ -101,6 +135,8 @@ bilin_h_v  put, h, a5
>  bilin_h_v  avg, h, a5
>  bilin_h_v  put, v, a6
>  bilin_h_v  avg, v, a6
> +bilin_hv   put
> +bilin_hv   avg
> 
>  .macro func_bilin_h_v len, op, type
>  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> @@ -111,7 +147,7 @@ endfunc
> 
>  .irp len, 64, 32, 16, 8
>          .irp op, put, avg
> -                .irp type, h, v
> +                .irp type, h, v, hv
>                          func_bilin_h_v \len, \op, \type
>                  .endr
>          .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index 9606d8545f..b3700dfb08 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][0][1][0] =
> ff_put_vp9_bilin_4h_rvv; dsp->mc[4][FILTER_BILINEAR ][1][0][1] =
> ff_avg_vp9_bilin_4v_rvv; dsp->mc[4][FILTER_BILINEAR ][1][1][0] =
> ff_avg_vp9_bilin_4h_rvv; +    dsp->mc[0][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_64hv_rvv; +    dsp->mc[0][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_64hv_rvv; +    dsp->mc[1][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_32hv_rvv; +    dsp->mc[1][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_32hv_rvv; +    dsp->mc[2][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_16hv_rvv; +    dsp->mc[2][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_16hv_rvv; +    dsp->mc[3][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_8hv_rvv; +    dsp->mc[3][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_8hv_rvv; +    dsp->mc[4][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_4hv_rvv; +    dsp->mc[4][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_4hv_rvv;
> 
>  #undef init_fpel
>      }


-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v
  2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v uk7b
  2024-05-29 17:19   ` flow gg
@ 2024-05-29 20:29   ` Rémi Denis-Courmont
  1 sibling, 0 replies; 7+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-29 20:29 UTC (permalink / raw)
  To: ffmpeg-devel

Le keskiviikkona 29. toukokuuta 2024, 20.15.39 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
>                                                      C908   X60
> vp9_avg_8tap_smooth_4h_8bpp_c                      :   13.0   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    5.0    4.2
> vp9_avg_8tap_smooth_4v_8bpp_c                      :   13.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    5.0    4.2
> vp9_avg_8tap_smooth_8h_8bpp_c                      :   49.5   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.2    8.5
> vp9_avg_8tap_smooth_8v_8bpp_c                      :   66.5   45.0
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.7  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.2   18.7
> vp9_avg_8tap_smooth_16v_8bpp_c                     :  192.2  175.7
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.5   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.7
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   83.5   60.0
> vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.5  689.2
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.2   60.0
> vp9_avg_8tap_smooth_64h_8bpp_c                     : 3115.5 2647.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  283.5  119.2
> vp9_avg_8tap_smooth_64v_8bpp_c                     : 3082.2 2729.0
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  305.2  119.0
> vp9_put_8tap_smooth_4h_8bpp_c                      :   11.2    9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    4.0
> vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.7
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.2    4.0
> vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_8v_8bpp_c                      :   44.2   38.7
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_16h_8bpp_c                     :  165.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   19.5   17.5
> vp9_put_8tap_smooth_16v_8bpp_c                     :  169.0  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c                     :  659.7  586.7
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   64.2   57.2
> vp9_put_8tap_smooth_32v_8bpp_c                     :  680.5  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.2   57.2
> vp9_put_8tap_smooth_64h_8bpp_c                     : 2681.5 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  255.5  114.2
> vp9_put_8tap_smooth_64v_8bpp_c                     : 2709.7 2348.7
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 204 +++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
>  libavcodec/riscv/vp9dsp_init.c |  37 +++++-
>  3 files changed, 288 insertions(+), 25 deletions(-)
> 
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 990271736b..53dd833dac 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
> 
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +        vsetvli         zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +        vsetvli         zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +        vsetvli         zero, zero, e16, m2, ta, ma
> +.else
> +        vsetvli         zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>          csrwi           vxrm, 0
> @@ -127,8 +139,200 @@ func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
>  endfunc
>  .endm
> 
> +.macro epel_filter name, type, regtype
> +        lla             \regtype\()2, ff_vp9_subpel_filters
> +
> +.ifc \name,regular
> +        addi            \regtype\()2, \regtype\()2, 16*8*2

You can directly LLA filters + 16 * 8 * 2 and save one add. Same below. You can 
also use .equ to alias the filter addresses, and avoid if's.

> +.endif
> +.ifc \name,sharp
> +        addi            \regtype\()2, \regtype\()2, 16*8*2*2
> +.endif
> +
> +.ifc \type,v
> +        slli            \regtype\()0, a6, 4
> +.else
> +        slli            \regtype\()0, a5, 4
> +.endif

Use a macro parameter for the stride register.

> +        add             \regtype\()0, \regtype\()0, \regtype\()2
> +
> +        lh              \regtype\()1, 2(\regtype\()0)
> +        lh              \regtype\()2, 4(\regtype\()0)
> +        lh              \regtype\()3, 6(\regtype\()0)
> +        lh              \regtype\()4, 8(\regtype\()0)
> +        lh              \regtype\()5, 10(\regtype\()0)
> +        lh              \regtype\()6, 12(\regtype\()0)
> +
> +.ifc \regtype,t
> +        lh              a7, 14(\regtype\()0)
> +.else
> +        lh              s7, 14(\regtype\()0)
> +.endif
> +        lh              \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +        li              a5, 64
> +.ifc \from_mem, 1
> +        vle8.v          v22, (a2)
> +.ifc \type,v
> +        sub             a2, a2, a3
> +        vle8.v          v20, (a2)
> +        sh1add          a2, a3, a2
> +        vle8.v          v24, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v26, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v28, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v30, (a2)
> +.else
> +        addi            a2, a2, -1
> +        vle8.v          v20, (a2)
> +        addi            a2, a2, 2
> +        vle8.v          v24, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v26, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v30, (a2)

That's a lot of address dependencies, which is going to hurt performance. It 
might help to just spill more S registers if needed.

> +.endif
> +
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v24, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v20
> +        vwmaccu.vx      v16, \regtype\()5, v26
> +        vwmaccsu.vx     v16, \regtype\()6, v28
> +.else
> +        vwmulu.vx       v16, v28, \regtype\()6
> +        vwmaccsu.vx     v16, \regtype\()2, v20
> +        vwmaccsu.vx     v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v30
> +.else
> +        vwmaccsu.vx     v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> +        .rept 6
> +        sub             a2, a2, a3
> +        .endr

This can be done in 3 instructions, even without mul. Of course you'll again 
need a spare register.

> +        vle8.v          v28, (a2)
> +        sub             a2, a2, a3
> +        vle8.v          v26, (a2)
> +        sh1add          a2, a3, a2
> +        add             a2, a2, a3
> +.else
> +        addi            a2, a2, -6
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, -1
> +        vle8.v          v26, (a2)
> +        addi            a2, a2, 3
> +.endif
> +
> +.ifc \name,smooth
> +        vwmaccsu.vx     v16, \regtype\()1, v28
> +.else
> +        vwmaccu.vx      v16, \regtype\()1, v28
> +        vwmulu.vx       v28, v24, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v26
> +        vwmulu.vx       v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v8, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v4
> +        vwmaccu.vx      v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()1, v2
> +.else
> +        vwmulu.vx       v16, v2, \regtype\()1
> +        vwmaccu.vx      v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()2, v4
> +        vwmulu.vx       v28, v8, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v0
> +        vwmulu.vx       v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v14
> +.else
> +        vwmaccsu.vx     v16, s7, v14
> +.endif
> +
> +.endif
> +        vwadd.wx        v16, v16, a5
> +        vsetvlstatic16  \len
> +
> +.ifc \name,smooth
> +        vwadd.vv        v24, v16, v20
> +.else
> +        vwadd.vv        v24, v16, v28
> +        vwadd.wv        v24, v24, v20
> +.endif
> +        vnsra.wi        v24, v24, 7
> +        vmax.vx         v24, v24, zero
> +        vsetvlstatic8   \len, zero, 32, m2
> +
> +        vnclipu.wi      \dst, v24, 0
> +.ifc \op,avg
> +        vle8.v          v24, (a0)
> +        vaaddu.vv       \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> +        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
> +        add             a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> +        epel_filter     \name, \type, t
> +.if \vlen < 256
> +        vsetvlstatic8   \len, a5, 32, m2
> +.else
> +        vsetvlstatic8   \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +.if \len == 64 && \vlen < 256
> +        addi            a0, a0, 32
> +        addi            a2, a2, 32
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +        addi            a0, a0, -32
> +        addi            a2, a2, -32
> +.endif
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
> +        .irp op, put, avg
> +                .irp name, regular, sharp, smooth
> +                        .irp type, h, v
> +                                epel \len, \op, \name, \type, 128
> +                                epel \len, \op, \name, \type, 256
> +                        .endr
> +                .endr
> +        .endr
>  .endr
> 
>  bilin_h_v  put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, const uint8_t *a);
> 
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                      
>   \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx,
> min_vlen)              \ +void
> ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \ +
>                                        ptrdiff_t dststride,                
> \ const uint8_t *src,                  \ ptrdiff_t srcstride,              
>   \ int h, int mx, int my);              \ \ -void
> ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,       
>   \ +                                        ptrdiff_t dststride,          
>       \ const uint8_t *src,                  \ ptrdiff_t srcstride,        
>         \ int h, int mx, int my);              \ \ -void
> ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,      
>   \ +                                         ptrdiff_t dststride,         
>       \ const uint8_t *src,                 \ ptrdiff_t srcstride,         
>       \ int h, int mx, int my);             \ \ -void
> ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,       
>   \ +                                        ptrdiff_t dststride,          
>       \ const uint8_t *src,                  \ ptrdiff_t srcstride,        
>         \ int h, int mx, int my);              \ \ -void
> ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,       
>   \ +                                        ptrdiff_t dststride,          
>       \ const uint8_t *src,                  \ ptrdiff_t srcstride,        
>         \ int h, int mx, int my);              \ \ -void
> ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,      
>   \ +                                         ptrdiff_t dststride,         
>       \ const uint8_t *src,                 \ ptrdiff_t srcstride,         
>       \ int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> dststride,     \ const uint8_t *src, ptrdiff_t srcstride,   \ int h, int
> mx, int my);
> 
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
> 
>  VP9_BILINEAR_RISCV_RVV_FUNC(64);
>  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..5f759e6bc8 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp) # endif
> 
>  #if HAVE_RVV
> -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128))
> { +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +    if (ff_rv_vlen_least(128)) {
> 
>  #define init_fpel(idx1, sz)                                           \
>      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
> @@ -95,6 +96,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_4hv_rvv;
> 
>  #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> +
> +    init_subpel2(0, 1, 0, h, put, 128);
> +    init_subpel2(1, 1, 0, h, avg, 128);
> +
> +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +        init_subpel2(0, 0, 1, v, put, 128);
> +        init_subpel2(1, 0, 1, v, avg, 128);
> +    }
> +
> +    }
> +    if (ff_rv_vlen_least(256)) {
> +        init_subpel2(0, 1, 0, h, put, 256);
> +        init_subpel2(1, 1, 0, h, avg, 256);
> +
> +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +            init_subpel2(0, 0, 1, v, put, 256);
> +            init_subpel2(1, 0, 1, v, avg, 256);
> +        }
> +    }
>      }
>  #endif
>  #endif


-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-05-29 20:29 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20240529171540.911099-1-uk7b@foxmail.com>
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 2/5] lavc/vp9dsp: R-V V mc bilin h v uk7b
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-05-29 20:07   ` Rémi Denis-Courmont
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v uk7b
2024-05-29 17:19   ` flow gg
2024-05-29 20:29   ` Rémi Denis-Courmont
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 5/5] lavc/vp9dsp: R-V V mc tap hv uk7b

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git