Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH v2 2/9] lavc/vp9dsp: R-V mc copy
       [not found] <20240507073613.2871668-1-uk7b@foxmail.com>
@ 2024-05-07  7:36 ` uk7b
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor uk7b
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07  7:36 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp9_put4_8bpp_c: 0.7
vp9_put4_8bpp_rvi: 0.5
vp9_put8_8bpp_c: 2.5
vp9_put8_8bpp_rvi: 0.5
vp9_put16_8bpp_c: 16.7
vp9_put16_8bpp_rvi: 1.5
vp9_put32_8bpp_c: 37.2
vp9_put32_8bpp_rvi: 5.7
vp9_put64_8bpp_c: 107.5
vp9_put64_8bpp_rvi: 21.7
---
 libavcodec/riscv/Makefile      |   3 +-
 libavcodec/riscv/vp9_mc_rvi.S  | 105 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp.h      |   3 +
 libavcodec/riscv/vp9dsp_init.c |  25 ++++++++
 4 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/vp9_mc_rvi.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 65dd0d656a..5846861bac 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -63,7 +63,8 @@ RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
 OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
 RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
 OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
-RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o
+RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
+                                 riscv/vp9_mc_rvi.o
 RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
 OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
 RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
diff --git a/libavcodec/riscv/vp9_mc_rvi.S b/libavcodec/riscv/vp9_mc_rvi.S
new file mode 100644
index 0000000000..0db14e83c7
--- /dev/null
+++ b/libavcodec/riscv/vp9_mc_rvi.S
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+#if __riscv_xlen >= 64
+func ff_copy64_rvi /* copy h rows of 64 bytes: a0=dst a1=dststride a2=src a3=srcstride a4=h */
+1:
+        addi          a4, a4, -1 /* rows left; only tested at the loop end, so h must be >= 1 */
+        ld            t0, (a2) /* read the whole source row as eight doublewords */
+        ld            t1, 8(a2) /* NOTE(review): no alignment check on a2/a0 is visible - confirm ld/sd are acceptable */
+        ld            t2, 16(a2)
+        ld            t3, 24(a2)
+        ld            t4, 32(a2)
+        ld            t5, 40(a2)
+        ld            t6, 48(a2)
+        ld            a7, 56(a2) /* t0-t6 are all taken, a7 serves as the eighth temporary */
+        sd            t0, (a0) /* every load is issued before the first store */
+        sd            t1, 8(a0)
+        sd            t2, 16(a0)
+        sd            t3, 24(a0)
+        sd            t4, 32(a0)
+        sd            t5, 40(a0)
+        sd            t6, 48(a0)
+        sd            a7, 56(a0)
+        add           a2, a2, a3 /* src += srcstride */
+        add           a0, a0, a1 /* dst += dststride */
+        bnez          a4, 1b
+
+        ret
+endfunc
+
+func ff_copy32_rvi /* copy h rows of 32 bytes: a0=dst a1=dststride a2=src a3=srcstride a4=h */
+1:
+        addi          a4, a4, -1 /* rows left; only tested at the loop end, so h must be >= 1 */
+        ld            t0, (a2) /* four doubleword loads cover the 32-byte row */
+        ld            t1, 8(a2)
+        ld            t2, 16(a2)
+        ld            t3, 24(a2)
+        sd            t0, (a0) /* stores start only after all loads of the row */
+        sd            t1, 8(a0)
+        sd            t2, 16(a0)
+        sd            t3, 24(a0)
+        add           a2, a2, a3 /* src += srcstride */
+        add           a0, a0, a1 /* dst += dststride */
+        bnez          a4, 1b
+
+        ret
+endfunc
+
+func ff_copy16_rvi /* copy h rows of 16 bytes: a0=dst a1=dststride a2=src a3=srcstride a4=h */
+1:
+        addi          a4, a4, -1 /* rows left; only tested at the loop end, so h must be >= 1 */
+        ld            t0, (a2) /* two doubleword loads cover the 16-byte row */
+        ld            t1, 8(a2)
+        sd            t0, (a0)
+        sd            t1, 8(a0)
+        add           a2, a2, a3 /* src += srcstride */
+        add           a0, a0, a1 /* dst += dststride */
+        bnez          a4, 1b
+
+        ret
+endfunc
+
+func ff_copy8_rvi /* copy h rows of 8 bytes: a0=dst a1=dststride a2=src a3=srcstride a4=h */
+1:
+        addi          a4, a4, -1 /* rows left; only tested at the loop end, so h must be >= 1 */
+        ld            t0, (a2) /* one doubleword is the whole row */
+        sd            t0, (a0)
+        add           a2, a2, a3 /* src += srcstride */
+        add           a0, a0, a1 /* dst += dststride */
+        bnez          a4, 1b
+
+        ret
+endfunc
+#endif
+
+func ff_copy4_rvi /* copy h rows of 4 bytes: a0=dst a1=dststride a2=src a3=srcstride a4=h; sits outside the xlen >= 64 guard */
+1:
+        addi          a4, a4, -1 /* rows left; only tested at the loop end, so h must be >= 1 */
+        lw            t0, (a2) /* one 32-bit word is the whole row */
+        sw            t0, (a0) /* only the low 32 bits are written back */
+        add           a2, a2, a3 /* src += srcstride */
+        add           a0, a0, a1 /* dst += dststride */
+        bnez          a4, 1b
+
+        ret
+endfunc
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index f8bc6563a5..b8ff282f8a 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -167,6 +167,9 @@ void ff_copy##SIZE##_rvi(uint8_t *dst, ptrdiff_t dststride,        \
                          const uint8_t *src, ptrdiff_t srcstride,  \
                          int h, int mx, int my);
 
+VP9_COPY_RISCV_RVI_FUNC(64);
+VP9_COPY_RISCV_RVI_FUNC(32);
+VP9_COPY_RISCV_RVI_FUNC(16);
 VP9_COPY_RISCV_RVI_FUNC(8);
 VP9_COPY_RISCV_RVI_FUNC(4);
 
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index d249dd71b2..c10f8bbe41 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -64,7 +64,50 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
     #endif
 }
 
+/**
+ * Install the scalar (RVI) full-pel copy routines in the VP9 MC table.
+ *
+ * Only acts when bpp == 8 and av_get_cpu_flags() reports AV_CPU_FLAG_RVI;
+ * each ff_copy<N>_rvi is then stored in dsp->mc[idx][filter][0][0][0] for
+ * all four filter types, with idx 0..4 mapping to widths 64, 32, 16, 8, 4.
+ *
+ * NOTE(review): the copy routines use plain lw/ld/sw/sd on src and dst;
+ * no alignment requirement is checked here - confirm that is acceptable.
+ *
+ * @param dsp DSP context whose mc[] entries are overwritten
+ * @param bpp bit depth; anything other than 8 leaves dsp untouched
+ */
+static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
+{
+#if HAVE_RV
+    int flags = av_get_cpu_flags();
+
+    if (bpp == 8 && (flags & AV_CPU_FLAG_RVI)) {
+
+#define init_fpel(idx1, sz)                                           \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][0][0][0] = ff_copy##sz##_rvi;  \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][0][0][0] = ff_copy##sz##_rvi;  \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][0][0][0] = ff_copy##sz##_rvi;  \
+    dsp->mc[idx1][FILTER_BILINEAR    ][0][0][0] = ff_copy##sz##_rvi
+
+    /* vp9_mc_rvi.S only assembles the 64/32/16/8 variants (they use
+     * ld/sd) when __riscv_xlen >= 64; referencing them on RV32 would
+     * leave undefined symbols at link time. ff_copy4_rvi always exists. */
+#if __riscv_xlen >= 64
+    init_fpel(0, 64);
+    init_fpel(1, 32);
+    init_fpel(2, 16);
+    init_fpel(3, 8);
+#endif
+    init_fpel(4, 4);
+
+#undef init_fpel
+    }
+#endif
+}
+
 av_cold void ff_vp9dsp_init_riscv(VP9DSPContext *dsp, int bpp, int bitexact)
 {
     vp9dsp_intrapred_init_riscv(dsp, bpp);
+    vp9dsp_mc_init_riscv(dsp, bpp);
 }
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor
       [not found] <20240507073613.2871668-1-uk7b@foxmail.com>
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 2/9] lavc/vp9dsp: R-V mc copy uk7b
@ 2024-05-07  7:36 ` uk7b
  2024-05-07 16:08   ` Rémi Denis-Courmont
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 4/9] lavc/vp9dsp: R-V V ipred tm uk7b
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 10+ messages in thread
From: uk7b @ 2024-05-07  7:36 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp9_hor_8x8_8bpp_c: 74.7
vp9_hor_8x8_8bpp_rvv_i32: 35.7
vp9_hor_16x16_8bpp_c: 175.5
vp9_hor_16x16_8bpp_rvv_i32: 80.2
vp9_hor_32x32_8bpp_c: 510.2
vp9_hor_32x32_8bpp_rvv_i32: 264.0
---
 libavcodec/riscv/vp9_intra_rvv.S | 56 ++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp.h        |  6 ++++
 libavcodec/riscv/vp9dsp_init.c   |  3 ++
 3 files changed, 65 insertions(+)

diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S
index db9774c263..dd9bc036e7 100644
--- a/libavcodec/riscv/vp9_intra_rvv.S
+++ b/libavcodec/riscv/vp9_intra_rvv.S
@@ -113,3 +113,59 @@ func_dc dc_left  8   left 3  0  zve64x
 func_dc dc_top   32  top  5  1  zve32x
 func_dc dc_top   16  top  4  1  zve32x
 func_dc dc_top   8   top  3  0  zve64x
+
+func ff_h_32x32_rvv, zve32x /* 32x32 horizontal intra prediction: a0=dst a1=stride a2=l (a3=a is not read) */
+        li           t0, 32
+        addi         a2, a2, 31 /* start at l[31]; the pointer is walked downwards */
+        vsetvli      zero, t0, e8, m2, ta, ma /* request vl = 32 bytes, one row per even-numbered register group */
+
+        .rept 2 /* two passes of 16 rows each */
+        .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+        lbu          t1, (a2) /* next byte of l ... */
+        addi         a2, a2, -1
+        vmv.v.x      v\n, t1 /* ... replicated across the whole row */
+        .endr
+        .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+        vse8.v       v\n, (a0) /* write the row, then step dst by stride */
+        add          a0, a0, a1
+        .endr
+        .endr
+
+        ret
+endfunc
+
+func ff_h_16x16_rvv, zve32x /* 16x16 horizontal intra prediction: a0=dst a1=stride a2=l (a3=a is not read) */
+        addi         a2, a2, 15 /* start at l[15]; the pointer is walked downwards */
+        vsetivli     zero, 16, e8, m1, ta, ma /* request vl = 16 bytes, one row per register */
+
+        .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+        lbu          t1, (a2) /* next byte of l ... */
+        addi         a2, a2, -1
+        vmv.v.x      v\n, t1 /* ... replicated across the whole row */
+        .endr
+        .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
+        vse8.v       v\n, (a0) /* write the row, then step dst by stride */
+        add          a0, a0, a1
+        .endr
+        vse8.v       v23, (a0) /* last row is stored outside the loop, so dst is not advanced again */
+
+        ret
+endfunc
+
+func ff_h_8x8_rvv, zve32x /* 8x8 horizontal intra prediction: a0=dst a1=stride a2=l (a3=a is not read) */
+        addi         a2, a2, 7 /* start at l[7]; the pointer is walked downwards */
+        vsetivli     zero, 8, e8, mf2, ta, ma /* request vl = 8 bytes, fractional LMUL */
+
+        .irp n 8, 9, 10, 11, 12, 13, 14, 15
+        lbu          t1, (a2) /* next byte of l ... */
+        addi         a2, a2, -1
+        vmv.v.x      v\n, t1 /* ... replicated across the whole row */
+        .endr
+        .irp n 8, 9, 10, 11, 12, 13, 14
+        vse8.v       v\n, (a0) /* write the row, then step dst by stride */
+        add          a0, a0, a1
+        .endr
+        vse8.v       v15, (a0) /* last row is stored outside the loop, so dst is not advanced again */
+
+        ret
+endfunc
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index b8ff282f8a..0ad961c7e0 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -66,6 +66,12 @@ void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
                     const uint8_t *a);
 void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
                   const uint8_t *a);
+void ff_h_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+                    const uint8_t *a);
+void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+                    const uint8_t *a);
+void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+                  const uint8_t *a);
 
 #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                         \
 void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index c10f8bbe41..7816b13fe0 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -59,6 +59,9 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
             dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
             dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
             dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv;
+            dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv;
+            dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv;
+            dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv;
         }
     #endif
     #endif
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 4/9] lavc/vp9dsp: R-V V ipred tm
       [not found] <20240507073613.2871668-1-uk7b@foxmail.com>
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 2/9] lavc/vp9dsp: R-V mc copy uk7b
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor uk7b
@ 2024-05-07  7:36 ` uk7b
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 5/9] lavc/vp9dsp: R-V V mc avg uk7b
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07  7:36 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp9_tm_4x4_8bpp_c: 116.5
vp9_tm_4x4_8bpp_rvv_i32: 43.5
vp9_tm_8x8_8bpp_c: 416.2
vp9_tm_8x8_8bpp_rvv_i32: 86.0
vp9_tm_16x16_8bpp_c: 1665.5
vp9_tm_16x16_8bpp_rvv_i32: 187.2
vp9_tm_32x32_8bpp_c: 6974.2
vp9_tm_32x32_8bpp_rvv_i32: 625.7
---
 libavcodec/riscv/vp9_intra_rvv.S | 141 +++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp.h        |   8 ++
 libavcodec/riscv/vp9dsp_init.c   |   4 +
 3 files changed, 153 insertions(+)

diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S
index dd9bc036e7..7a51aa2bf1 100644
--- a/libavcodec/riscv/vp9_intra_rvv.S
+++ b/libavcodec/riscv/vp9_intra_rvv.S
@@ -169,3 +169,144 @@ func ff_h_8x8_rvv, zve32x
 
         ret
 endfunc
+
+.macro tm_sum dst, top, offset /* dst = top + (byte at offset(a2) - a4), added to every element */
+        lbu          t3, \offset(a2) /* a2 is the l pointer in every caller */
+        sub          t3, t3, a4 /* callers load a4 from -1(a3) beforehand */
+        vadd.vx      \dst, \top, t3 /* callers run this with SEW = 16 */
+.endm
+
+func ff_tm_32x32_rvv, zve32x /* 32x32 TM intra prediction: a0=dst a1=stride a2=l a3=a */
+        lbu          a4, -1(a3) /* byte just before a[0], subtracted in tm_sum */
+        li           t5, 32
+
+        .macro tm_sum32 n1,n2,n3,n4,n5,n6,n7,n8 /* produce eight rows; n1..n8 are the l offsets to use */
+        vsetvli      zero, t5, e16, m4, ta, ma /* 32 elements of 16 bits */
+        vle8.v       v8, (a3) /* reloaded on every expansion: v8 is overwritten by the third tm_sum */
+        vzext.vf2    v28, v8 /* widen the a[] bytes to 16 bits */
+
+        tm_sum       v0, v28, \n1
+        tm_sum       v4, v28, \n2
+        tm_sum       v8, v28, \n3
+        tm_sum       v12, v28, \n4
+        tm_sum       v16, v28, \n5
+        tm_sum       v20, v28, \n6
+        tm_sum       v24, v28, \n7
+        tm_sum       v28, v28, \n8 /* done last because it overwrites the widened a[] */
+
+        .irp n 0, 4, 8, 12, 16, 20, 24, 28
+        vmax.vx      v\n, v\n, zero /* clamp negative sums to 0 */
+        .endr
+
+        vsetvli      zero, zero, e8, m2, ta, ma /* same vl, now byte elements */
+        .irp n 0, 4, 8, 12, 16, 20, 24, 28
+        vnclipu.wi   v\n, v\n, 0 /* narrow 16 -> 8 bits with unsigned saturation, no shift */
+        vse8.v       v\n, (a0)
+        add          a0, a0, a1
+        .endr
+        .endm
+
+        tm_sum32     31, 30, 29, 28, 27, 26, 25, 24 /* first dst row uses l[31] */
+        tm_sum32     23, 22, 21, 20, 19, 18, 17, 16
+        tm_sum32     15, 14, 13, 12, 11, 10, 9, 8
+        tm_sum32     7, 6, 5, 4, 3, 2, 1, 0
+
+        ret
+endfunc
+
+func ff_tm_16x16_rvv, zve32x /* 16x16 TM intra prediction: a0=dst a1=stride a2=l a3=a */
+        vsetivli      zero, 16, e16, m2, ta, ma /* 16 elements of 16 bits */
+        vle8.v        v8, (a3)
+        vzext.vf2     v30, v8 /* widen the a[] bytes to 16 bits */
+        lbu           a4, -1(a3) /* byte just before a[0], subtracted in tm_sum */
+
+        tm_sum       v0, v30, 15 /* first dst row uses l[15] */
+        tm_sum       v2, v30, 14
+        tm_sum       v4, v30, 13
+        tm_sum       v6, v30, 12
+        tm_sum       v8, v30, 11
+        tm_sum       v10, v30, 10
+        tm_sum       v12, v30, 9
+        tm_sum       v14, v30, 8
+        tm_sum       v16, v30, 7
+        tm_sum       v18, v30, 6
+        tm_sum       v20, v30, 5
+        tm_sum       v22, v30, 4
+        tm_sum       v24, v30, 3
+        tm_sum       v26, v30, 2
+        tm_sum       v28, v30, 1
+        tm_sum       v30, v30, 0 /* done last because it overwrites the widened a[] */
+
+        .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+        vmax.vx      v\n, v\n, zero /* clamp negative sums to 0 */
+        .endr
+
+        vsetvli      zero, zero, e8, m1, ta, ma /* same vl, now byte elements */
+        .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28
+        vnclipu.wi   v\n, v\n, 0 /* narrow 16 -> 8 bits with unsigned saturation, no shift */
+        vse8.v       v\n, (a0)
+        add          a0, a0, a1
+        .endr
+        vnclipu.wi   v30, v30, 0 /* last row handled outside the loop, dst is not advanced again */
+        vse8.v       v30, (a0)
+
+        ret
+endfunc
+
+func ff_tm_8x8_rvv, zve32x /* 8x8 TM intra prediction: a0=dst a1=stride a2=l a3=a */
+        vsetivli     zero, 8, e16, m1, ta, ma /* 8 elements of 16 bits */
+        vle8.v       v8, (a3)
+        vzext.vf2    v28, v8 /* widen the a[] bytes to 16 bits */
+        lbu          a4, -1(a3) /* byte just before a[0], subtracted in tm_sum */
+
+        tm_sum       v16, v28, 7 /* first dst row uses l[7] */
+        tm_sum       v17, v28, 6
+        tm_sum       v18, v28, 5
+        tm_sum       v19, v28, 4
+        tm_sum       v20, v28, 3
+        tm_sum       v21, v28, 2
+        tm_sum       v22, v28, 1
+        tm_sum       v23, v28, 0
+
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23
+        vmax.vx      v\n, v\n, zero /* clamp negative sums to 0 */
+        .endr
+
+        vsetvli      zero, zero, e8, mf2, ta, ma /* same vl, now byte elements */
+        .irp n 16, 17, 18, 19, 20, 21, 22
+        vnclipu.wi   v\n, v\n, 0 /* narrow 16 -> 8 bits with unsigned saturation, no shift */
+        vse8.v       v\n, (a0)
+        add          a0, a0, a1
+        .endr
+        vnclipu.wi   v24, v23, 0 /* last row narrows into v24 rather than in place; reason not evident here */
+        vse8.v       v24, (a0)
+
+        ret
+endfunc
+
+func ff_tm_4x4_rvv, zve32x /* 4x4 TM intra prediction: a0=dst a1=stride a2=l a3=a */
+        vsetivli     zero, 4, e16, mf2, ta, ma /* 4 elements of 16 bits */
+        vle8.v       v8, (a3)
+        vzext.vf2    v28, v8 /* widen the a[] bytes to 16 bits */
+        lbu          a4, -1(a3) /* byte just before a[0], subtracted in tm_sum */
+
+        tm_sum       v16, v28, 3 /* first dst row uses l[3] */
+        tm_sum       v17, v28, 2
+        tm_sum       v18, v28, 1
+        tm_sum       v19, v28, 0
+
+        .irp n 16, 17, 18, 19
+        vmax.vx      v\n, v\n, zero /* clamp negative sums to 0 */
+        .endr
+
+        vsetvli      zero, zero, e8, mf4, ta, ma /* same vl, now byte elements */
+        .irp n 16, 17, 18
+        vnclipu.wi   v\n, v\n, 0 /* narrow 16 -> 8 bits with unsigned saturation, no shift */
+        vse8.v       v\n, (a0)
+        add          a0, a0, a1
+        .endr
+        vnclipu.wi   v24, v19, 0 /* last row narrows into v24 rather than in place; reason not evident here */
+        vse8.v       v24, (a0)
+
+        ret
+endfunc
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 0ad961c7e0..79330b4968 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
                     const uint8_t *a);
 void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
                   const uint8_t *a);
+void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+                     const uint8_t *a);
+void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+                     const uint8_t *a);
+void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+                   const uint8_t *a);
+void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+                   const uint8_t *a);
 
 #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                         \
 void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 7816b13fe0..8023c333db 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -62,6 +62,10 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
             dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv;
             dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv;
             dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv;
+            dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_tm_32x32_rvv;
+            dsp->intra_pred[TX_16X16][TM_VP8_PRED] = ff_tm_16x16_rvv;
+            dsp->intra_pred[TX_8X8][TM_VP8_PRED] = ff_tm_8x8_rvv;
+            dsp->intra_pred[TX_4X4][TM_VP8_PRED] = ff_tm_4x4_rvv;
         }
     #endif
     #endif
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 5/9] lavc/vp9dsp: R-V V mc avg
       [not found] <20240507073613.2871668-1-uk7b@foxmail.com>
                   ` (2 preceding siblings ...)
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 4/9] lavc/vp9dsp: R-V V ipred tm uk7b
@ 2024-05-07  7:36 ` uk7b
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 6/9] lavc/vp9dsp: R-V V mc bilin h v uk7b
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07  7:36 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp9_avg4_8bpp_c: 1.2
vp9_avg4_8bpp_rvv_i64: 1.0
vp9_avg8_8bpp_c: 3.7
vp9_avg8_8bpp_rvv_i64: 1.5
vp9_avg16_8bpp_c: 14.7
vp9_avg16_8bpp_rvv_i64: 3.5
vp9_avg32_8bpp_c: 57.7
vp9_avg32_8bpp_rvv_i64: 10.0
vp9_avg64_8bpp_c: 229.0
vp9_avg64_8bpp_rvv_i64: 31.7
---
 libavcodec/riscv/Makefile      |  3 +-
 libavcodec/riscv/vp9_mc_rvv.S  | 58 ++++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp_init.c | 19 +++++++++++
 3 files changed, 79 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/vp9_mc_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 5846861bac..73c9f24d97 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -65,6 +65,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
 OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
 RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
                                  riscv/vp9_mc_rvi.o
-RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
+RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
+                                  riscv/vp9_mc_rvv.o
 OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
 RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
new file mode 100644
index 0000000000..81ecb49435
--- /dev/null
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vsetvlstatic8 len an mn=m4 /* set vl = len with SEW = 8; LMUL is picked from the assemble-time len */
+.if \len <= 4
+        vsetivli        zero, \len, e8, mf4, ta, ma
+.elseif \len <= 8
+        vsetivli        zero, \len, e8, mf2, ta, ma
+.elseif \len <= 16
+        vsetivli        zero, \len, e8, m1, ta, ma
+.elseif \len <= 32
+        li              \an, \len /* from here on len goes through scratch register an instead of an immediate */
+        vsetvli         zero, \an, e8, m2, ta, ma
+.elseif \len <= 64 /* len above 64 matches no branch and emits nothing */
+        li              \an, \len
+        vsetvli         zero, \an, e8, \mn, ta, ma /* LMUL for this range is caller-selectable, m4 by default */
+.endif
+.endm
+
+.macro copy_avg len
+func ff_avg\len\()_rvv, zve32x
+        csrwi           vxrm, 0
+        vsetvlstatic8   \len t0 /* byte elements, vl = len; t0 is scratch for the longer lengths */
+1: /* per row: dst = average(src, dst); a0=dst a1=dststride a2=src a3=srcstride a4=h */
+        addi            a4, a4, -1 /* rows left; only tested at the loop end, so h must be >= 1 */
+        vle8.v          v8, (a2) /* source row */
+        vle8.v          v16, (a0) /* bytes currently in dst */
+        vaaddu.vv       v8, v8, v16 /* unsigned averaging add; rounding follows vxrm, set to 0 at function entry */
+        vse8.v          v8, (a0)
+        add             a2, a2, a3 /* src += srcstride */
+        add             a0, a0, a1 /* dst += dststride */
+        bnez            a4, 1b
+        ret
+endfunc
+.endm
+
+.irp len 64, 32, 16, 8, 4
+        copy_avg \len
+.endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 8023c333db..2caaf732db 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -92,6 +92,25 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
 
 #undef init_fpel
     }
+
+#if HAVE_RVV /* vector variants are only referenced when the build has RVV */
+    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I64) && ff_get_rv_vlenb() >= 16) { /* presumably vlenb is the vector register size in bytes - TODO confirm */
+
+#define init_fpel(idx1, sz)                                           \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv;  \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = ff_avg##sz##_rvv;  \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][1][0][0] = ff_avg##sz##_rvv;  \
+    dsp->mc[idx1][FILTER_BILINEAR    ][1][0][0] = ff_avg##sz##_rvv
+
+    init_fpel(0, 64); /* idx 0..4 map to widths 64, 32, 16, 8, 4; all four filter types share one routine */
+    init_fpel(1, 32);
+    init_fpel(2, 16);
+    init_fpel(3, 8);
+    init_fpel(4, 4);
+
+#undef init_fpel
+    }
+#endif
 #endif
 }
 
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 6/9] lavc/vp9dsp: R-V V mc bilin h v
       [not found] <20240507073613.2871668-1-uk7b@foxmail.com>
                   ` (3 preceding siblings ...)
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 5/9] lavc/vp9dsp: R-V V mc avg uk7b
@ 2024-05-07  7:36 ` uk7b
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 7/9] lavc/vp9dsp: R-V V mc tap " uk7b
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07  7:36 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp9_avg_bilin_4h_8bpp_c: 5.2
vp9_avg_bilin_4h_8bpp_rvv_i64: 2.2
vp9_avg_bilin_4v_8bpp_c: 5.5
vp9_avg_bilin_4v_8bpp_rvv_i64: 2.2
vp9_avg_bilin_8h_8bpp_c: 20.0
vp9_avg_bilin_8h_8bpp_rvv_i64: 4.5
vp9_avg_bilin_8v_8bpp_c: 21.0
vp9_avg_bilin_8v_8bpp_rvv_i64: 4.2
vp9_avg_bilin_16h_8bpp_c: 78.2
vp9_avg_bilin_16h_8bpp_rvv_i64: 9.0
vp9_avg_bilin_16v_8bpp_c: 82.0
vp9_avg_bilin_16v_8bpp_rvv_i64: 9.0
vp9_avg_bilin_32h_8bpp_c: 325.5
vp9_avg_bilin_32h_8bpp_rvv_i64: 26.2
vp9_avg_bilin_32v_8bpp_c: 326.2
vp9_avg_bilin_32v_8bpp_rvv_i64: 26.2
vp9_avg_bilin_64h_8bpp_c: 1265.7
vp9_avg_bilin_64h_8bpp_rvv_i64: 91.5
vp9_avg_bilin_64v_8bpp_c: 1317.0
vp9_avg_bilin_64v_8bpp_rvv_i64: 91.2
vp9_put_bilin_4h_8bpp_c: 4.5
vp9_put_bilin_4h_8bpp_rvv_i64: 1.7
vp9_put_bilin_4v_8bpp_c: 4.7
vp9_put_bilin_4v_8bpp_rvv_i64: 1.7
vp9_put_bilin_8h_8bpp_c: 17.0
vp9_put_bilin_8h_8bpp_rvv_i64: 3.5
vp9_put_bilin_8v_8bpp_c: 18.0
vp9_put_bilin_8v_8bpp_rvv_i64: 3.5
vp9_put_bilin_16h_8bpp_c: 65.2
vp9_put_bilin_16h_8bpp_rvv_i64: 7.5
vp9_put_bilin_16v_8bpp_c: 85.7
vp9_put_bilin_16v_8bpp_rvv_i64: 7.5
vp9_put_bilin_32h_8bpp_c: 257.5
vp9_put_bilin_32h_8bpp_rvv_i64: 23.5
vp9_put_bilin_32v_8bpp_c: 274.5
vp9_put_bilin_32v_8bpp_rvv_i64: 23.5
vp9_put_bilin_64h_8bpp_c: 1040.5
vp9_put_bilin_64h_8bpp_rvv_i64: 82.5
vp9_put_bilin_64v_8bpp_c: 1108.7
vp9_put_bilin_64v_8bpp_rvv_i64: 82.2
---
 libavcodec/riscv/vp9_mc_rvv.S  | 43 ++++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp_init.c | 22 +++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 81ecb49435..598a67fc94 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -53,6 +53,49 @@ func ff_avg\len\()_rvv, zve32x
 endfunc
 .endm
 
+.macro bilin_load dst len op type mn /* one row of bilinear MC into dst; type is v or h, mn holds the multiplier */
+.ifc \type,v
+        add             t5, a2, a3 /* vertical: second tap is the same column in the next source row */
+.else
+        addi            t5, a2, 1 /* horizontal (any type other than v): second tap is the next source byte */
+.endif
+        vle8.v          v8, (a2) /* first tap */
+        vle8.v          v0, (t5) /* second tap */
+        vwmulu.vx       v16, v0, \mn /* v16 = mn * second tap, widened to 16 bits */
+        vwmaccsu.vx     v16, t1, v8 /* v16 += t1 * first tap; the caller sets t1 = -mn */
+        vwadd.wx        v16, v16, t4 /* add t4; the caller sets it to 8 */
+        vnsra.wi        v16, v16, 4 /* arithmetic >> 4, narrowed back to bytes */
+        vadd.vv         \dst, v16, v8 /* dst = first tap + scaled difference */
+.ifc \op,avg
+        vle8.v          v16, (a0) /* avg only: fetch the bytes currently in dst memory */
+        vaaddu.vv       \dst, \dst, v16 /* and average them with the filtered row */
+.endif
+.endm
+
+.macro bilin_h_v len op type mn /* emits ff_<op>_bilin_<len><type>_rvv; mn is the register holding the multiplier */
+func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x
+.ifc \op,avg
+        csrwi           vxrm, 0 /* rounding mode for the vaaddu in bilin_load; only the avg variant needs it */
+.endif
+        vsetvlstatic8   \len t0 /* byte elements, vl = len */
+        li              t4, 8 /* constant added before the >> 4 in bilin_load */
+        neg             t1, \mn /* t1 = -mn, the signed multiplier applied to the first tap */
+1:
+        addi            a4, a4, -1 /* rows left (a4 = h); only tested at the loop end, so h must be >= 1 */
+        bilin_load      v0, \len, \op, \type, \mn /* filtered row ends up in v0 */
+        vse8.v          v0, (a0)
+        add             a2, a2, a3 /* src += srcstride */
+        add             a0, a0, a1 /* dst += dststride */
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len 64, 32, 16, 8, 4
         copy_avg \len
+        .irp op put avg
+                bilin_h_v \len \op h a5
+                bilin_h_v \len \op v a6
+        .endr
 .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 2caaf732db..cfeaa06c0a 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -109,6 +109,28 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     init_fpel(4, 4);
 
 #undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)  \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] =   \
+        ff_##type##_bilin_##sz##dir##_rvv;                   \
+
+#define init_subpel2(idx, idxh, idxv, dir, type)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
+
+#define init_subpel3(idx, type)         \
+    init_subpel2(idx, 1, 0, h, type);   \
+    init_subpel2(idx, 0, 1, v, type);   \
+
+    init_subpel3(0, put);
+    init_subpel3(1, avg);
+
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
     }
 #endif
 #endif
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 7/9] lavc/vp9dsp: R-V V mc tap h v
       [not found] <20240507073613.2871668-1-uk7b@foxmail.com>
                   ` (4 preceding siblings ...)
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 6/9] lavc/vp9dsp: R-V V mc bilin h v uk7b
@ 2024-05-07  7:36 ` uk7b
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 8/9] lavc/vp9dsp: R-V V mc bilin hv uk7b
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 9/9] lavc/vp9dsp: R-V V mc tap hv uk7b
  7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07  7:36 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp9_avg_8tap_smooth_4h_8bpp_c: 13.0
vp9_avg_8tap_smooth_4h_8bpp_rvv_i64: 5.0
vp9_avg_8tap_smooth_4v_8bpp_c: 13.7
vp9_avg_8tap_smooth_4v_8bpp_rvv_i64: 5.0
vp9_avg_8tap_smooth_8h_8bpp_c: 48.7
vp9_avg_8tap_smooth_8h_8bpp_rvv_i64: 9.5
vp9_avg_8tap_smooth_8v_8bpp_c: 50.0
vp9_avg_8tap_smooth_8v_8bpp_rvv_i64: 9.5
vp9_avg_8tap_smooth_16h_8bpp_c: 192.5
vp9_avg_8tap_smooth_16h_8bpp_rvv_i64: 21.2
vp9_avg_8tap_smooth_16v_8bpp_c: 191.5
vp9_avg_8tap_smooth_16v_8bpp_rvv_i64: 21.2
vp9_avg_8tap_smooth_32h_8bpp_c: 763.7
vp9_avg_8tap_smooth_32h_8bpp_rvv_i64: 67.2
vp9_avg_8tap_smooth_32v_8bpp_c: 770.7
vp9_avg_8tap_smooth_32v_8bpp_rvv_i64: 67.2
vp9_avg_8tap_smooth_64h_8bpp_c: 3098.7
vp9_avg_8tap_smooth_64h_8bpp_rvv_i64: 283.2
vp9_avg_8tap_smooth_64v_8bpp_c: 3045.2
vp9_avg_8tap_smooth_64v_8bpp_rvv_i64: 266.7
vp9_put_8tap_smooth_4h_8bpp_c: 11.0
vp9_put_8tap_smooth_4h_8bpp_rvv_i64: 4.2
vp9_put_8tap_smooth_4v_8bpp_c: 28.5
vp9_put_8tap_smooth_4v_8bpp_rvv_i64: 4.2
vp9_put_8tap_smooth_8h_8bpp_c: 42.2
vp9_put_8tap_smooth_8h_8bpp_rvv_i64: 8.5
vp9_put_8tap_smooth_8v_8bpp_c: 43.7
vp9_put_8tap_smooth_8v_8bpp_rvv_i64: 8.5
vp9_put_8tap_smooth_16h_8bpp_c: 165.7
vp9_put_8tap_smooth_16h_8bpp_rvv_i64: 19.7
vp9_put_8tap_smooth_16v_8bpp_c: 168.5
vp9_put_8tap_smooth_16v_8bpp_rvv_i64: 19.5
vp9_put_8tap_smooth_32h_8bpp_c: 675.5
vp9_put_8tap_smooth_32h_8bpp_rvv_i64: 64.2
vp9_put_8tap_smooth_32v_8bpp_c: 664.7
vp9_put_8tap_smooth_32v_8bpp_rvv_i64: 64.2
vp9_put_8tap_smooth_64h_8bpp_c: 2680.5
vp9_put_8tap_smooth_64h_8bpp_rvv_i64: 272.0
vp9_put_8tap_smooth_64v_8bpp_c: 2692.5
vp9_put_8tap_smooth_64v_8bpp_rvv_i64: 272.0
---
 libavcodec/riscv/vp9_mc_rvv.S  | 238 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp_init.c |   8 ++
 2 files changed, 246 insertions(+)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 598a67fc94..99605dfbb5 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -36,6 +36,18 @@
 .endif
 .endm
 
+.macro vsetvlstatic16 len
+.ifc \len,4
+        vsetvli         zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+        vsetvli         zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+        vsetvli         zero, zero, e16, m2, ta, ma
+.else
+        vsetvli         zero, zero, e16, m4, ta, ma
+.endif
+.endm
+
 .macro copy_avg len
 func ff_avg\len\()_rvv, zve32x
         csrwi           vxrm, 0
@@ -92,10 +104,236 @@ func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x
 endfunc
 .endm
 
+const subpel_filters_regular
+        .byte  0,  0,   0, 128,   0,   0,  0,  0
+        .byte  0,  1,  -5, 126,   8,  -3,  1,  0
+        .byte -1,  3, -10, 122,  18,  -6,  2,  0
+        .byte -1,  4, -13, 118,  27,  -9,  3, -1
+        .byte -1,  4, -16, 112,  37, -11,  4, -1
+        .byte -1,  5, -18, 105,  48, -14,  4, -1
+        .byte -1,  5, -19,  97,  58, -16,  5, -1
+        .byte -1,  6, -19,  88,  68, -18,  5, -1
+        .byte -1,  6, -19,  78,  78, -19,  6, -1
+        .byte -1,  5, -18,  68,  88, -19,  6, -1
+        .byte -1,  5, -16,  58,  97, -19,  5, -1
+        .byte -1,  4, -14,  48, 105, -18,  5, -1
+        .byte -1,  4, -11,  37, 112, -16,  4, -1
+        .byte -1,  3,  -9,  27, 118, -13,  4, -1
+        .byte  0,  2,  -6,  18, 122, -10,  3, -1
+        .byte  0,  1,  -3,   8, 126,  -5,  1,  0
+subpel_filters_sharp:
+        .byte  0,  0,   0, 128,   0,   0,  0,  0
+        .byte -1,  3,  -7, 127,   8,  -3,  1,  0
+        .byte -2,  5, -13, 125,  17,  -6,  3, -1
+        .byte -3,  7, -17, 121,  27, -10,  5, -2
+        .byte -4,  9, -20, 115,  37, -13,  6, -2
+        .byte -4, 10, -23, 108,  48, -16,  8, -3
+        .byte -4, 10, -24, 100,  59, -19,  9, -3
+        .byte -4, 11, -24,  90,  70, -21, 10, -4
+        .byte -4, 11, -23,  80,  80, -23, 11, -4
+        .byte -4, 10, -21,  70,  90, -24, 11, -4
+        .byte -3,  9, -19,  59, 100, -24, 10, -4
+        .byte -3,  8, -16,  48, 108, -23, 10, -4
+        .byte -2,  6, -13,  37, 115, -20,  9, -4
+        .byte -2,  5, -10,  27, 121, -17,  7, -3
+        .byte -1,  3,  -6,  17, 125, -13,  5, -2
+        .byte  0,  1,  -3,   8, 127,  -7,  3, -1
+subpel_filters_smooth:
+        .byte  0,  0,   0, 128,   0,   0,  0,  0
+        .byte -3, -1,  32,  64,  38,   1, -3,  0
+        .byte -2, -2,  29,  63,  41,   2, -3,  0
+        .byte -2, -2,  26,  63,  43,   4, -4,  0
+        .byte -2, -3,  24,  62,  46,   5, -4,  0
+        .byte -2, -3,  21,  60,  49,   7, -4,  0
+        .byte -1, -4,  18,  59,  51,   9, -4,  0
+        .byte -1, -4,  16,  57,  53,  12, -4, -1
+        .byte -1, -4,  14,  55,  55,  14, -4, -1
+        .byte -1, -4,  12,  53,  57,  16, -4, -1
+        .byte  0, -4,   9,  51,  59,  18, -4, -1
+        .byte  0, -4,   7,  49,  60,  21, -3, -2
+        .byte  0, -4,   5,  46,  62,  24, -3, -2
+        .byte  0, -4,   4,  43,  63,  26, -2, -2
+        .byte  0, -3,   2,  41,  63,  29, -2, -2
+        .byte  0, -3,   1,  38,  64,  32, -1, -3
+endconst
+
+.macro epel_filter name type regtype
+        lla             \regtype\()2, subpel_filters_\name
+        li              \regtype\()1, 8
+.ifc \type,v
+        mul             \regtype\()0, a6, \regtype\()1
+.elseif \type == h
+        mul             \regtype\()0, a5, \regtype\()1
+.endif
+        add             \regtype\()0, \regtype\()0, \regtype\()2
+        .irp n 1,2,3,4,5,6
+        lb              \regtype\n, \n(\regtype\()0)
+        .endr
+.ifc \regtype,t
+        lb              a7, 7(\regtype\()0)
+.elseif \regtype == s
+        lb              s7, 7(\regtype\()0)
+.endif
+        lb              \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst len op name type from_mem regtype
+        li              a5, 64
+.ifc \from_mem, 1
+        vle8.v          v22, (a2)
+.ifc \type,v
+        sub             a2, a2, a3
+        vle8.v          v20, (a2)
+        sh1add          a2, a3, a2
+        vle8.v          v24, (a2)
+        add             a2, a2, a3
+        vle8.v          v26, (a2)
+        add             a2, a2, a3
+        vle8.v          v28, (a2)
+        add             a2, a2, a3
+        vle8.v          v30, (a2)
+.elseif \type == h
+        addi            a2, a2, -1
+        vle8.v          v20, (a2)
+        addi            a2, a2, 2
+        vle8.v          v24, (a2)
+        addi            a2, a2, 1
+        vle8.v          v26, (a2)
+        addi            a2, a2, 1
+        vle8.v          v28, (a2)
+        addi            a2, a2, 1
+        vle8.v          v30, (a2)
+.endif
+
+.ifc \name,smooth
+        vwmulu.vx       v16, v24, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v20
+        vwmaccu.vx      v16, \regtype\()5, v26
+        vwmaccsu.vx     v16, \regtype\()6, v28
+.else
+        vwmulu.vx       v16, v28, \regtype\()6
+        vwmaccsu.vx     v16, \regtype\()2, v20
+        vwmaccsu.vx     v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v30
+.elseif \regtype == s
+        vwmaccsu.vx     v16, s7, v30
+.endif
+
+.ifc \type,v
+        .rept 6
+        sub             a2, a2, a3
+        .endr
+        vle8.v          v28, (a2)
+        sub             a2, a2, a3
+        vle8.v          v26, (a2)
+        sh1add          a2, a3, a2
+        add             a2, a2, a3
+.elseif \type == h
+        addi            a2, a2, -6
+        vle8.v          v28, (a2)
+        addi            a2, a2, -1
+        vle8.v          v26, (a2)
+        addi            a2, a2, 3
+.endif
+
+.ifc \name,smooth
+        vwmaccsu.vx     v16, \regtype\()1, v28
+.else
+        vwmaccu.vx      v16, \regtype\()1, v28
+        vwmulu.vx       v28, v24, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v26
+        vwmulu.vx       v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+        vwmulu.vx       v16, v8, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v4
+        vwmaccu.vx      v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()1, v2
+.else
+        vwmulu.vx       v16, v2, \regtype\()1
+        vwmaccu.vx      v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()2, v4
+        vwmulu.vx       v28, v8, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v0
+        vwmulu.vx       v20, v6, \regtype\()3
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v14
+.elseif \regtype == s
+        vwmaccsu.vx     v16, s7, v14
+.endif
+
+.endif
+        vwadd.wx        v16, v16, a5
+        vsetvlstatic16  \len
+
+.ifc \name,smooth
+        vwadd.vv        v24, v16, v20
+.else
+        vwadd.vv        v24, v16, v28
+        vwadd.wv        v24, v24, v20
+.endif
+        vnsra.wi        v24, v24, 7
+        vmax.vx         v24, v24, zero
+        vsetvlstatic8   \len, zero, m2
+
+        vnclipu.wi      \dst, v24, 0
+.ifc \op,avg
+        vle8.v          v24, (a0)
+        vaaddu.vv       \dst, \dst, v24
+.endif
+
+.endm
+
+.macro epel_load_inc dst len op name type from_mem regtype
+        epel_load       \dst \len \op \name \type \from_mem \regtype
+        add             a2, a2, a3
+.endm
+
+.macro epel len op name type
+func ff_\op\()_8tap_\name\()_\len\()\type\()_rvv, zve32x
+        epel_filter     \name \type t
+        vsetvlstatic8   \len a5 m2
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+
+1:
+        addi            a4, a4, -1
+        epel_load       v30 \len \op \name \type 1 t
+        vse8.v          v30, (a0)
+.ifc \len,64
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_load       v30 \len \op \name \type 1 t
+        vse8.v          v30, (a0)
+        addi            a0, a0, -32
+        addi            a2, a2, -32
+.endif
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len 64, 32, 16, 8, 4
         copy_avg \len
         .irp op put avg
                 bilin_h_v \len \op h a5
                 bilin_h_v \len \op v a6
+                .irp name regular sharp smooth
+                        .irp type h v
+                                epel \len \op \name \type
+                        .endr
+                .endr
         .endr
 .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index cfeaa06c0a..a45aea530d 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -113,6 +113,12 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
 #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)  \
     dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] =   \
         ff_##type##_bilin_##sz##dir##_rvv;                   \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_smooth_##sz##dir##_rvv;             \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_regular_##sz##dir##_rvv;            \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_sharp_##sz##dir##_rvv;
 
 #define init_subpel2(idx, idxh, idxv, dir, type)      \
     init_subpel1(0, idx, idxh, idxv, 64, dir, type);  \
@@ -123,7 +129,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
 
 #define init_subpel3(idx, type)         \
     init_subpel2(idx, 1, 0, h, type);   \
+    if (flags & AV_CPU_FLAG_RVB_ADDR) { \
     init_subpel2(idx, 0, 1, v, type);   \
+    }
 
     init_subpel3(0, put);
     init_subpel3(1, avg);
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 8/9] lavc/vp9dsp: R-V V mc bilin hv
       [not found] <20240507073613.2871668-1-uk7b@foxmail.com>
                   ` (5 preceding siblings ...)
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 7/9] lavc/vp9dsp: R-V V mc tap " uk7b
@ 2024-05-07  7:36 ` uk7b
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 9/9] lavc/vp9dsp: R-V V mc tap hv uk7b
  7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07  7:36 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp9_avg_bilin_4hv_8bpp_c: 11.0
vp9_avg_bilin_4hv_8bpp_rvv_i64: 3.7
vp9_avg_bilin_8hv_8bpp_c: 38.7
vp9_avg_bilin_8hv_8bpp_rvv_i64: 7.2
vp9_avg_bilin_16hv_8bpp_c: 147.0
vp9_avg_bilin_16hv_8bpp_rvv_i64: 14.2
vp9_avg_bilin_32hv_8bpp_c: 574.5
vp9_avg_bilin_32hv_8bpp_rvv_i64: 42.7
vp9_avg_bilin_64hv_8bpp_c: 2311.5
vp9_avg_bilin_64hv_8bpp_rvv_i64: 201.7
vp9_put_bilin_4hv_8bpp_c: 10.0
vp9_put_bilin_4hv_8bpp_rvv_i64: 3.2
vp9_put_bilin_8hv_8bpp_c: 35.2
vp9_put_bilin_8hv_8bpp_rvv_i64: 6.5
vp9_put_bilin_16hv_8bpp_c: 133.7
vp9_put_bilin_16hv_8bpp_rvv_i64: 13.0
vp9_put_bilin_32hv_8bpp_c: 538.2
vp9_put_bilin_32hv_8bpp_rvv_i64: 39.7
vp9_put_bilin_64hv_8bpp_c: 2114.0
vp9_put_bilin_64hv_8bpp_rvv_i64: 153.7
---
 libavcodec/riscv/vp9_mc_rvv.S | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 99605dfbb5..01404bbde5 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -104,6 +104,39 @@ func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x
 endfunc
 .endm
 
+.macro bilin_hv len op
+func ff_\op\()_bilin_\len\()hv_rvv, zve32x
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+        vsetvlstatic8   \len t0
+        neg             t1, a5
+        neg             t2, a6
+        li              t4, 8
+        bilin_load      v24, \len, put, h, a5
+        add             a2, a2, a3
+1:
+        addi            a4, a4, -1
+        bilin_load      v4, \len, put, h, a5
+        vwmulu.vx       v16, v4, a6
+        vwmaccsu.vx     v16, t2, v24
+        vwadd.wx        v16, v16, t4
+        vnsra.wi        v16, v16, 4
+        vadd.vv         v0, v16, v24
+.ifc \op,avg
+        vle8.v          v16, (a0)
+        vaaddu.vv       v0, v0, v16
+.endif
+        vse8.v          v0, (a0)
+        vmv.v.v         v24, v4
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 const subpel_filters_regular
         .byte  0,  0,   0, 128,   0,   0,  0,  0
         .byte  0,  1,  -5, 126,   8,  -3,  1,  0
@@ -330,6 +363,7 @@ endfunc
         .irp op put avg
                 bilin_h_v \len \op h a5
                 bilin_h_v \len \op v a6
+                bilin_hv \len \op
                 .irp name regular sharp smooth
                         .irp type h v
                                 epel \len \op \name \type
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 9/9] lavc/vp9dsp: R-V V mc tap hv
       [not found] <20240507073613.2871668-1-uk7b@foxmail.com>
                   ` (6 preceding siblings ...)
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 8/9] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-05-07  7:36 ` uk7b
  7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07  7:36 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp9_avg_8tap_smooth_4hv_8bpp_c: 32.0
vp9_avg_8tap_smooth_4hv_8bpp_rvv_i64: 15.0
vp9_avg_8tap_smooth_8hv_8bpp_c: 114.5
vp9_avg_8tap_smooth_8hv_8bpp_rvv_i64: 40.5
vp9_avg_8tap_smooth_16hv_8bpp_c: 338.7
vp9_avg_8tap_smooth_16hv_8bpp_rvv_i64: 46.5
vp9_avg_8tap_smooth_32hv_8bpp_c: 1270.7
vp9_avg_8tap_smooth_32hv_8bpp_rvv_i64: 134.0
vp9_avg_8tap_smooth_64hv_8bpp_c: 4923.5
vp9_avg_8tap_smooth_64hv_8bpp_rvv_i64: 523.5
vp9_put_8tap_smooth_4hv_8bpp_c: 30.5
vp9_put_8tap_smooth_4hv_8bpp_rvv_i64: 14.2
vp9_put_8tap_smooth_8hv_8bpp_c: 91.7
vp9_put_8tap_smooth_8hv_8bpp_rvv_i64: 22.7
vp9_put_8tap_smooth_16hv_8bpp_c: 328.7
vp9_put_8tap_smooth_16hv_8bpp_rvv_i64: 45.0
vp9_put_8tap_smooth_32hv_8bpp_c: 1166.7
vp9_put_8tap_smooth_32hv_8bpp_rvv_i64: 131.0
vp9_put_8tap_smooth_64hv_8bpp_c: 4532.5
vp9_put_8tap_smooth_64hv_8bpp_rvv_i64: 512.5
---
 libavcodec/riscv/vp9_mc_rvv.S  | 94 ++++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp_init.c |  3 +-
 2 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 01404bbde5..0dcec94bbf 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -358,6 +358,99 @@ func ff_\op\()_8tap_\name\()_\len\()\type\()_rvv, zve32x
 endfunc
 .endm
 
+.macro epel_hv_once len name op
+        sub             a2, a2, a3
+        sub             a2, a2, a3
+        sub             a2, a2, a3
+        .irp n 0 2 4 6 8 10 12 14
+        epel_load_inc   v\n \len put \name h 1 t
+        .endr
+        addi            a4, a4, -1
+1:
+        addi            a4, a4, -1
+        epel_load       v30 \len \op \name v 0 s
+        vse8.v          v30, (a0)
+        vmv.v.v         v0, v2
+        vmv.v.v         v2, v4
+        vmv.v.v         v4, v6
+        vmv.v.v         v6, v8
+        vmv.v.v         v8, v10
+        vmv.v.v         v10, v12
+        vmv.v.v         v12, v14
+        epel_load       v14 \len put \name h 1 t
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+        epel_load       v30 \len \op \name v 0 s
+        vse8.v          v30, (a0)
+.endm
+
+.macro epel_hv op name len
+func ff_\op\()_8tap_\name\()_\len\()hv_rvv, zve32x
+#if __riscv_xlen == 64
+        addi            sp, sp, -64
+        .irp n 0,1,2,3,4,5,6,7
+        sd              s\n, \n\()<<3(sp)
+        .endr
+#else
+        addi            sp, sp, -32
+        .irp n 0,1,2,3,4,5,6,7
+        sw              s\n, \n\()<<2(sp)
+        .endr
+#endif
+.ifc \len,64
+#if __riscv_xlen == 64
+        addi            sp, sp, -48
+        .irp n 0,1,2,3,4,5
+        sd              a\n, \n\()<<3(sp)
+        .endr
+#else
+        addi            sp, sp, -24
+        .irp n 0,1,2,3,4,5
+        sw              a\n, \n\()<<2(sp)
+        .endr
+#endif
+.endif
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+        epel_filter     \name h t
+        epel_filter     \name v s
+        vsetvlstatic8   \len a6 m2
+        epel_hv_once    \len \name \op
+.ifc \len,64
+#if __riscv_xlen == 64
+        .irp n 0,1,2,3,4,5
+        ld              a\n, \n\()<<3(sp)
+        .endr
+        addi            sp, sp, 48
+#else
+        .irp n 0,1,2,3,4,5
+        lw              a\n, \n\()<<2(sp)
+        .endr
+        addi            sp, sp, 24
+#endif
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_filter     \name h t
+        epel_hv_once    \len \name \op
+.endif
+#if __riscv_xlen == 64
+        .irp n 0,1,2,3,4,5,6,7
+        ld              s\n, \n\()<<3(sp)
+        .endr
+        addi            sp, sp, 64
+#else
+        .irp n 0,1,2,3,4,5,6,7
+        lw              s\n, \n\()<<2(sp)
+        .endr
+        addi            sp, sp, 32
+#endif
+
+        ret
+endfunc
+.endm
+
 .irp len 64, 32, 16, 8, 4
         copy_avg \len
         .irp op put avg
@@ -368,6 +461,7 @@ endfunc
                         .irp type h v
                                 epel \len \op \name \type
                         .endr
+                        epel_hv \op \name \len
                 .endr
         .endr
 .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index a45aea530d..554fcefa6e 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -131,7 +131,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     init_subpel2(idx, 1, 0, h, type);   \
     if (flags & AV_CPU_FLAG_RVB_ADDR) { \
     init_subpel2(idx, 0, 1, v, type);   \
-    }
+    }                                   \
+    init_subpel2(idx, 1, 1, hv, type)
 
     init_subpel3(0, put);
     init_subpel3(1, avg);
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor
  2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor uk7b
@ 2024-05-07 16:08   ` Rémi Denis-Courmont
  2024-05-07 18:42     ` flow gg
  0 siblings, 1 reply; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-07 16:08 UTC (permalink / raw)
  To: ffmpeg-devel

Le tiistaina 7. toukokuuta 2024, 10.36.07 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
> C908:
> vp9_hor_8x8_8bpp_c: 74.7
> vp9_hor_8x8_8bpp_rvv_i32: 35.7
> vp9_hor_16x16_8bpp_c: 175.5
> vp9_hor_16x16_8bpp_rvv_i32: 80.2
> vp9_hor_32x32_8bpp_c: 510.2
> vp9_hor_32x32_8bpp_rvv_i32: 264.0
> ---
>  libavcodec/riscv/vp9_intra_rvv.S | 56 ++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vp9dsp.h        |  6 ++++
>  libavcodec/riscv/vp9dsp_init.c   |  3 ++
>  3 files changed, 65 insertions(+)
> 
> diff --git a/libavcodec/riscv/vp9_intra_rvv.S
> b/libavcodec/riscv/vp9_intra_rvv.S index db9774c263..dd9bc036e7 100644
> --- a/libavcodec/riscv/vp9_intra_rvv.S
> +++ b/libavcodec/riscv/vp9_intra_rvv.S
> @@ -113,3 +113,59 @@ func_dc dc_left  8   left 3  0  zve64x
>  func_dc dc_top   32  top  5  1  zve32x
>  func_dc dc_top   16  top  4  1  zve32x
>  func_dc dc_top   8   top  3  0  zve64x
> +
> +func ff_h_32x32_rvv, zve32x
> +        li           t0, 32
> +        addi         a2, a2, 31
> +        vsetvli      zero, t0, e8, m2, ta, ma
> +
> +        .rept 2
> +        .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> +        lbu          t1, (a2)
> +        addi         a2, a2, -1
> +        vmv.v.x      v\n, t1
> +        .endr
> +        .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> +        vse8.v       v\n, (a0)
> +        add          a0, a0, a1
> +        .endr
> +        .endr

Do you gain much by unrolling all the way to 16x? Given that you have the 
counter value already in t0, it should not make much difference to just unroll 
2x or maybe 4x and then loop.

It might also be faster to use lhu or lwu and shift to reduce scalar loads, at 
least if the vector is suitably aligned.

> +
> +        ret
> +endfunc
> +
> +func ff_h_16x16_rvv, zve32x
> +        addi         a2, a2, 15
> +        vsetivli     zero, 16, e8, m1, ta, ma
> +
> +        .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
> +        lbu          t1, (a2)
> +        addi         a2, a2, -1
> +        vmv.v.x      v\n, t1
> +        .endr
> +        .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
> +        vse8.v       v\n, (a0)
> +        add          a0, a0, a1
> +        .endr
> +        vse8.v       v23, (a0)
> +
> +        ret
> +endfunc
> +
> +func ff_h_8x8_rvv, zve32x
> +        addi         a2, a2, 7
> +        vsetivli     zero, 8, e8, mf2, ta, ma
> +
> +        .irp n 8, 9, 10, 11, 12, 13, 14, 15
> +        lbu          t1, (a2)
> +        addi         a2, a2, -1
> +        vmv.v.x      v\n, t1
> +        .endr
> +        .irp n 8, 9, 10, 11, 12, 13, 14
> +        vse8.v       v\n, (a0)
> +        add          a0, a0, a1
> +        .endr
> +        vse8.v       v15, (a0)
> +
> +        ret
> +endfunc
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index b8ff282f8a..0ad961c7e0 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -66,6 +66,12 @@ void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, const uint8_t *a);
>  void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>                    const uint8_t *a);
> +void ff_h_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +                    const uint8_t *a);
> +void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +                    const uint8_t *a);
> +void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +                  const uint8_t *a);
> 
>  #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                      
>   \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \ diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c index c10f8bbe41..7816b13fe0 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -59,6 +59,9 @@ static av_cold void
> vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
> dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
> dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
> dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv; +           
> dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; +           
> dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; +           
> dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv;
>          }
>      #endif
>      #endif


-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor
  2024-05-07 16:08   ` Rémi Denis-Courmont
@ 2024-05-07 18:42     ` flow gg
  0 siblings, 0 replies; 10+ messages in thread
From: flow gg @ 2024-05-07 18:42 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

> Do you gain much by unrolling all the way to 16x? Given that you have the
> counter value already in t0, it should not make much difference to just
> unroll 2x or maybe 4x and then loop.

I chose this simple method because I think the effect is about the same.
Do I need to change it?

> It might also be faster to use lhu or lwu and shift to reduce scalar
> loads, at least if the vector is suitably aligned.

I just tested ff_h_16x16_rvv; the lbu version is faster (lbu * 16 version:
80.2, lwu * 4 version: 117.2).


Rémi Denis-Courmont <remi@remlab.net> 于2024年5月8日周三 00:10写道:

> Le tiistaina 7. toukokuuta 2024, 10.36.07 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> > C908:
> > vp9_hor_8x8_8bpp_c: 74.7
> > vp9_hor_8x8_8bpp_rvv_i32: 35.7
> > vp9_hor_16x16_8bpp_c: 175.5
> > vp9_hor_16x16_8bpp_rvv_i32: 80.2
> > vp9_hor_32x32_8bpp_c: 510.2
> > vp9_hor_32x32_8bpp_rvv_i32: 264.0
> > ---
> >  libavcodec/riscv/vp9_intra_rvv.S | 56 ++++++++++++++++++++++++++++++++
> >  libavcodec/riscv/vp9dsp.h        |  6 ++++
> >  libavcodec/riscv/vp9dsp_init.c   |  3 ++
> >  3 files changed, 65 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vp9_intra_rvv.S
> > b/libavcodec/riscv/vp9_intra_rvv.S index db9774c263..dd9bc036e7 100644
> > --- a/libavcodec/riscv/vp9_intra_rvv.S
> > +++ b/libavcodec/riscv/vp9_intra_rvv.S
> > @@ -113,3 +113,59 @@ func_dc dc_left  8   left 3  0  zve64x
> >  func_dc dc_top   32  top  5  1  zve32x
> >  func_dc dc_top   16  top  4  1  zve32x
> >  func_dc dc_top   8   top  3  0  zve64x
> > +
> > +func ff_h_32x32_rvv, zve32x
> > +        li           t0, 32
> > +        addi         a2, a2, 31
> > +        vsetvli      zero, t0, e8, m2, ta, ma
> > +
> > +        .rept 2
> > +        .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> > +        lbu          t1, (a2)
> > +        addi         a2, a2, -1
> > +        vmv.v.x      v\n, t1
> > +        .endr
> > +        .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> > +        vse8.v       v\n, (a0)
> > +        add          a0, a0, a1
> > +        .endr
> > +        .endr
>
> Do you gain much by unrolling all the way to 16x? Given that you have the
> counter value already in t0, it should not make much difference to just
> unroll
> 2x or maybe 4x and then loop.
>
> It might also be faster to use lhu or lwu and shift to reduce scalar
> loads, at
> least if the vector is suitably aligned.
>
> > +
> > +        ret
> > +endfunc
> > +
> > +func ff_h_16x16_rvv, zve32x
> > +        addi         a2, a2, 15
> > +        vsetivli     zero, 16, e8, m1, ta, ma
> > +
> > +        .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
> > +        lbu          t1, (a2)
> > +        addi         a2, a2, -1
> > +        vmv.v.x      v\n, t1
> > +        .endr
> > +        .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
> > +        vse8.v       v\n, (a0)
> > +        add          a0, a0, a1
> > +        .endr
> > +        vse8.v       v23, (a0)
> > +
> > +        ret
> > +endfunc
> > +
> > +func ff_h_8x8_rvv, zve32x
> > +        addi         a2, a2, 7
> > +        vsetivli     zero, 8, e8, mf2, ta, ma
> > +
> > +        .irp n 8, 9, 10, 11, 12, 13, 14, 15
> > +        lbu          t1, (a2)
> > +        addi         a2, a2, -1
> > +        vmv.v.x      v\n, t1
> > +        .endr
> > +        .irp n 8, 9, 10, 11, 12, 13, 14
> > +        vse8.v       v\n, (a0)
> > +        add          a0, a0, a1
> > +        .endr
> > +        vse8.v       v15, (a0)
> > +
> > +        ret
> > +endfunc
> > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> > index b8ff282f8a..0ad961c7e0 100644
> > --- a/libavcodec/riscv/vp9dsp.h
> > +++ b/libavcodec/riscv/vp9dsp.h
> > @@ -66,6 +66,12 @@ void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride,
> const
> > uint8_t *l, const uint8_t *a);
> >  void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> >                    const uint8_t *a);
> > +void ff_h_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> > +                    const uint8_t *a);
> > +void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> > +                    const uint8_t *a);
> > +void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> > +                  const uint8_t *a);
> >
> >  #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>
> >   \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> > dststride,   \ diff --git a/libavcodec/riscv/vp9dsp_init.c
> > b/libavcodec/riscv/vp9dsp_init.c index c10f8bbe41..7816b13fe0 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -59,6 +59,9 @@ static av_cold void
> > vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
> > dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
> > dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
> > dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv; +
>
> > dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; +
> > dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; +
> > dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv;
> >          }
> >      #endif
> >      #endif
>
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-05-07 18:43 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20240507073613.2871668-1-uk7b@foxmail.com>
2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 2/9] lavc/vp9dsp: R-V mc copy uk7b
2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor uk7b
2024-05-07 16:08   ` Rémi Denis-Courmont
2024-05-07 18:42     ` flow gg
2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 4/9] lavc/vp9dsp: R-V V ipred tm uk7b
2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 5/9] lavc/vp9dsp: R-V V mc avg uk7b
2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 6/9] lavc/vp9dsp: R-V V mc bilin h v uk7b
2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 7/9] lavc/vp9dsp: R-V V mc tap " uk7b
2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 8/9] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-05-07  7:36 ` [FFmpeg-devel] [PATCH v2 9/9] lavc/vp9dsp: R-V V mc tap hv uk7b

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git