* [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
@ 2024-05-21 7:37 uk7b
2024-05-21 7:38 ` flow gg
` (2 more replies)
0 siblings, 3 replies; 19+ messages in thread
From: uk7b @ 2024-05-21 7:37 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
avg_8_2x2_c : 1.0 1.0
avg_8_2x2_rvv_i32 : 0.7 0.7
avg_8_2x4_c : 2.0 2.0
avg_8_2x4_rvv_i32 : 1.0 0.7
avg_8_2x8_c : 4.0 3.7
avg_8_2x8_rvv_i32 : 1.5 1.2
avg_8_2x16_c : 7.5 7.7
avg_8_2x16_rvv_i32 : 2.7 2.5
avg_8_2x32_c : 14.2 15.0
avg_8_2x32_rvv_i32 : 5.0 4.5
avg_8_2x64_c : 28.5 30.2
avg_8_2x64_rvv_i32 : 9.5 8.7
avg_8_2x128_c : 80.0 70.5
avg_8_2x128_rvv_i32 : 50.7 41.2
avg_8_4x2_c : 1.7 2.0
avg_8_4x2_rvv_i32 : 0.7 0.7
avg_8_4x4_c : 3.5 3.7
avg_8_4x4_rvv_i32 : 1.2 1.0
avg_8_4x8_c : 6.7 7.0
avg_8_4x8_rvv_i32 : 1.5 1.2
avg_8_4x16_c : 13.2 14.0
avg_8_4x16_rvv_i32 : 2.7 2.5
avg_8_4x32_c : 26.2 27.7
avg_8_4x32_rvv_i32 : 5.0 4.5
avg_8_4x64_c : 52.2 55.0
avg_8_4x64_rvv_i32 : 9.5 8.7
avg_8_4x128_c : 146.0 117.5
avg_8_4x128_rvv_i32 : 53.2 40.5
avg_8_8x2_c : 3.5 3.5
avg_8_8x2_rvv_i32 : 0.7 0.7
avg_8_8x4_c : 6.5 6.5
avg_8_8x4_rvv_i32 : 1.2 1.0
avg_8_8x8_c : 12.7 13.2
avg_8_8x8_rvv_i32 : 2.0 1.5
avg_8_8x16_c : 25.2 26.2
avg_8_8x16_rvv_i32 : 3.5 2.5
avg_8_8x32_c : 50.0 52.7
avg_8_8x32_rvv_i32 : 6.5 4.7
avg_8_8x64_c : 99.7 105.0
avg_8_8x64_rvv_i32 : 12.5 8.5
avg_8_8x128_c : 225.7 218.0
avg_8_8x128_rvv_i32 : 78.0 39.2
avg_8_16x2_c : 6.2 6.7
avg_8_16x2_rvv_i32 : 1.2 0.7
avg_8_16x4_c : 12.2 12.7
avg_8_16x4_rvv_i32 : 2.0 1.2
avg_8_16x8_c : 24.7 26.0
avg_8_16x8_rvv_i32 : 3.5 1.7
avg_8_16x16_c : 49.0 51.5
avg_8_16x16_rvv_i32 : 6.2 3.2
avg_8_16x32_c : 97.5 102.5
avg_8_16x32_rvv_i32 : 11.5 5.7
avg_8_16x64_c : 212.5 204.7
avg_8_16x64_rvv_i32 : 22.5 11.0
avg_8_16x128_c : 411.2 418.2
avg_8_16x128_rvv_i32 : 76.0 47.7
avg_8_32x2_c : 12.2 12.7
avg_8_32x2_rvv_i32 : 2.0 1.2
avg_8_32x4_c : 24.2 25.5
avg_8_32x4_rvv_i32 : 3.2 1.7
avg_8_32x8_c : 48.5 50.7
avg_8_32x8_rvv_i32 : 5.7 3.2
avg_8_32x16_c : 96.5 101.2
avg_8_32x16_rvv_i32 : 10.7 5.7
avg_8_32x32_c : 192.5 202.5
avg_8_32x32_rvv_i32 : 20.7 10.5
avg_8_32x64_c : 411.2 404.5
avg_8_32x64_rvv_i32 : 41.0 20.5
avg_8_32x128_c : 834.7 855.2
avg_8_32x128_rvv_i32 : 151.2 118.7
avg_8_64x2_c : 24.0 25.2
avg_8_64x2_rvv_i32 : 3.2 1.7
avg_8_64x4_c : 48.2 50.5
avg_8_64x4_rvv_i32 : 5.2 3.0
avg_8_64x8_c : 95.7 100.7
avg_8_64x8_rvv_i32 : 10.0 5.2
avg_8_64x16_c : 191.7 201.2
avg_8_64x16_rvv_i32 : 19.2 9.5
avg_8_64x32_c : 406.2 402.0
avg_8_64x32_rvv_i32 : 38.0 18.5
avg_8_64x64_c : 827.5 833.7
avg_8_64x64_rvv_i32 : 148.2 95.2
avg_8_64x128_c : 1607.7 1625.7
avg_8_64x128_rvv_i32 : 252.0 179.5
avg_8_128x2_c : 48.7 51.0
avg_8_128x2_rvv_i32 : 5.5 2.7
avg_8_128x4_c : 96.7 101.2
avg_8_128x4_rvv_i32 : 9.7 5.0
avg_8_128x8_c : 192.5 202.0
avg_8_128x8_rvv_i32 : 19.0 9.0
avg_8_128x16_c : 403.5 403.2
avg_8_128x16_rvv_i32 : 37.0 17.5
avg_8_128x32_c : 787.0 805.7
avg_8_128x32_rvv_i32 : 73.5 34.2
avg_8_128x64_c : 1635.7 1654.7
avg_8_128x64_rvv_i32 : 229.5 68.5
avg_8_128x128_c : 3217.0 3233.5
avg_8_128x128_rvv_i32 : 435.0 321.2
w_avg_8_2x2_c : 1.5 1.5
w_avg_8_2x2_rvv_i32 : 1.2 1.2
w_avg_8_2x4_c : 2.7 2.5
w_avg_8_2x4_rvv_i32 : 1.7 1.7
w_avg_8_2x8_c : 5.0 4.7
w_avg_8_2x8_rvv_i32 : 2.7 2.5
w_avg_8_2x16_c : 9.7 9.5
w_avg_8_2x16_rvv_i32 : 4.7 4.5
w_avg_8_2x32_c : 19.0 18.5
w_avg_8_2x32_rvv_i32 : 9.0 8.0
w_avg_8_2x64_c : 37.2 37.0
w_avg_8_2x64_rvv_i32 : 17.5 15.5
w_avg_8_2x128_c : 120.7 82.7
w_avg_8_2x128_rvv_i32 : 71.2 49.0
w_avg_8_4x2_c : 2.5 2.5
w_avg_8_4x2_rvv_i32 : 1.2 1.2
w_avg_8_4x4_c : 4.7 4.5
w_avg_8_4x4_rvv_i32 : 1.7 1.5
w_avg_8_4x8_c : 9.0 9.0
w_avg_8_4x8_rvv_i32 : 2.7 2.5
w_avg_8_4x16_c : 17.7 17.7
w_avg_8_4x16_rvv_i32 : 5.0 4.2
w_avg_8_4x32_c : 34.7 34.7
w_avg_8_4x32_rvv_i32 : 9.0 8.0
w_avg_8_4x64_c : 69.7 69.5
w_avg_8_4x64_rvv_i32 : 17.2 15.5
w_avg_8_4x128_c : 171.7 154.7
w_avg_8_4x128_rvv_i32 : 87.0 48.0
w_avg_8_8x2_c : 4.5 4.5
w_avg_8_8x2_rvv_i32 : 1.5 1.2
w_avg_8_8x4_c : 8.7 8.7
w_avg_8_8x4_rvv_i32 : 2.0 1.7
w_avg_8_8x8_c : 17.2 17.0
w_avg_8_8x8_rvv_i32 : 3.5 2.5
w_avg_8_8x16_c : 34.0 34.0
w_avg_8_8x16_rvv_i32 : 6.0 4.5
w_avg_8_8x32_c : 67.5 68.0
w_avg_8_8x32_rvv_i32 : 10.7 8.2
w_avg_8_8x64_c : 135.7 135.0
w_avg_8_8x64_rvv_i32 : 21.0 15.7
w_avg_8_8x128_c : 304.0 280.0
w_avg_8_8x128_rvv_i32 : 65.5 56.7
w_avg_8_16x2_c : 8.5 8.7
w_avg_8_16x2_rvv_i32 : 2.0 1.2
w_avg_8_16x4_c : 16.7 17.0
w_avg_8_16x4_rvv_i32 : 3.2 2.0
w_avg_8_16x8_c : 33.5 33.5
w_avg_8_16x8_rvv_i32 : 5.7 3.0
w_avg_8_16x16_c : 66.7 62.2
w_avg_8_16x16_rvv_i32 : 27.0 5.2
w_avg_8_16x32_c : 132.5 133.0
w_avg_8_16x32_rvv_i32 : 20.2 9.7
w_avg_8_16x64_c : 264.2 239.0
w_avg_8_16x64_rvv_i32 : 39.7 18.7
w_avg_8_16x128_c : 572.5 541.2
w_avg_8_16x128_rvv_i32 : 148.5 55.2
w_avg_8_32x2_c : 16.7 16.7
w_avg_8_32x2_rvv_i32 : 3.2 2.0
w_avg_8_32x4_c : 33.2 33.2
w_avg_8_32x4_rvv_i32 : 6.0 3.0
w_avg_8_32x8_c : 66.0 66.0
w_avg_8_32x8_rvv_i32 : 11.0 5.5
w_avg_8_32x16_c : 131.2 122.7
w_avg_8_32x16_rvv_i32 : 21.5 9.7
w_avg_8_32x32_c : 262.2 268.7
w_avg_8_32x32_rvv_i32 : 42.2 18.5
w_avg_8_32x64_c : 544.2 547.0
w_avg_8_32x64_rvv_i32 : 83.5 37.0
w_avg_8_32x128_c : 1426.7 1139.7
w_avg_8_32x128_rvv_i32 : 201.0 138.2
w_avg_8_64x2_c : 33.0 33.0
w_avg_8_64x2_rvv_i32 : 6.0 3.0
w_avg_8_64x4_c : 65.7 65.7
w_avg_8_64x4_rvv_i32 : 11.2 5.5
w_avg_8_64x8_c : 131.0 131.5
w_avg_8_64x8_rvv_i32 : 21.5 10.0
w_avg_8_64x16_c : 289.2 262.7
w_avg_8_64x16_rvv_i32 : 42.5 19.2
w_avg_8_64x32_c : 548.7 525.2
w_avg_8_64x32_rvv_i32 : 83.7 37.5
w_avg_8_64x64_c : 1139.5 1208.2
w_avg_8_64x64_rvv_i32 : 209.0 107.5
w_avg_8_64x128_c : 2495.5 2300.5
w_avg_8_64x128_rvv_i32 : 420.2 208.7
w_avg_8_128x2_c : 66.0 66.5
w_avg_8_128x2_rvv_i32 : 11.2 5.5
w_avg_8_128x4_c : 131.2 132.5
w_avg_8_128x4_rvv_i32 : 21.5 10.0
w_avg_8_128x8_c : 280.2 275.7
w_avg_8_128x8_rvv_i32 : 42.2 19.5
w_avg_8_128x16_c : 549.0 527.7
w_avg_8_128x16_rvv_i32 : 104.7 37.7
w_avg_8_128x32_c : 1215.2 1068.5
w_avg_8_128x32_rvv_i32 : 189.0 74.7
w_avg_8_128x64_c : 2305.5 2145.5
w_avg_8_128x64_rvv_i32 : 386.7 190.0
w_avg_8_128x128_c : 5797.0 4600.2
w_avg_8_128x128_rvv_i32 : 760.5 343.0
---
libavcodec/riscv/Makefile | 2 +
libavcodec/riscv/vvc_mc_rvv.S | 312 +++++++++++++++++++++++++++++++++
libavcodec/riscv/vvcdsp_init.c | 76 ++++++++
libavcodec/vvc/dsp.c | 4 +-
libavcodec/vvc/dsp.h | 1 +
5 files changed, 394 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
create mode 100644 libavcodec/riscv/vvcdsp_init.c
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 27b268ae39..6297664fc9 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
+OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
new file mode 100644
index 0000000000..26a6afba1f
--- /dev/null
+++ b/libavcodec/riscv/vvc_mc_rvv.S
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vsetvlstatic8 w vlen is_w
+ .if \w <= 2
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w <= 4 && \vlen == 128
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w <= 4 && \vlen >= 256
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w <= 8 && \vlen == 128
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w <= 8 && \vlen >= 256
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w <= 16 && \vlen == 128
+ vsetivli zero, \w, e8, m1, ta, ma
+ .elseif \w <= 16 && \vlen >= 256
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w <= 32 && \vlen >= 256
+ li t0, \w
+ vsetvli zero, t0, e8, m1, ta, ma
+ .elseif \w <= (\vlen / 4) || \is_w
+ li t0, 64
+ vsetvli zero, t0, e8, m2, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e8, m4, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic16 w vlen is_w
+ .if \w <= 2
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w <= 4 && \vlen == 128
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w <= 4 && \vlen >= 256
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w <= 8 && \vlen == 128
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w <= 8 && \vlen >= 256
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w <= 16 && \vlen == 128
+ vsetivli zero, \w, e16, m2, ta, ma
+ .elseif \w <= 16 && \vlen >= 256
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w <= 32 && \vlen >= 256
+ li t0, \w
+ vsetvli zero, t0, e16, m2, ta, ma
+ .elseif \w <= (\vlen / 4) || \is_w
+ li t0, 64
+ vsetvli zero, t0, e16, m4, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e16, m8, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic32 w vlen
+ .if \w <= 2
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w <= 4 && \vlen == 128
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w <= 4 && \vlen >= 256
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w <= 8 && \vlen == 128
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w <= 8 && \vlen >= 256
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w <= 16 && \vlen == 128
+ vsetivli zero, \w, e32, m4, ta, ma
+ .elseif \w <= 16 && \vlen >= 256
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w <= 32 && \vlen >= 256
+ li t0, \w
+ vsetvli zero, t0, e32, m4, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e32, m8, ta, ma
+ .endif
+.endm
+
+.macro avg_nx1 w vlen
+ vsetvlstatic16 \w, \vlen, 0
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vadd.vv v8, v8, v0
+ vmax.vx v8, v8, zero
+ vsetvlstatic8 \w, \vlen, 0
+ vnclipu.wi v8, v8, 7
+ vse8.v v8, (a0)
+.endm
+
+.macro avg w h vlen
+ csrw vxrm, zero
+
+.if \w <= (\vlen / 4) && \h >= 4
+.rept (\h / 4)
+ vsetvlstatic16 \w, \vlen, 0
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ addi t3, a2, 128*2*2
+ addi t4, a3, 128*2*2
+ addi a7, a3, 128*2*3
+ addi t6, a2, 128*2*3
+ add t2, a0, a1
+ sh1add t5, a1, a0
+ add a6, t5, a1
+ vle16.v v0, (a2)
+ vle16.v v4, (a3)
+ vle16.v v8, (t0)
+ vle16.v v12, (t1)
+ vle16.v v16, (t3)
+ vle16.v v20, (t4)
+ vle16.v v24, (t6)
+ vle16.v v28, (a7)
+ vadd.vv v4, v4, v0
+ vadd.vv v12, v12, v8
+ vadd.vv v20, v20, v16
+ vadd.vv v28, v28, v24
+ vmax.vx v4, v4, zero
+ vmax.vx v12, v12, zero
+ vmax.vx v20, v20, zero
+ vmax.vx v28, v28, zero
+ vsetvlstatic8 \w, \vlen, 0
+ vnclipu.wi v4, v4, 7
+ vnclipu.wi v12, v12, 7
+ vnclipu.wi v20, v20, 7
+ vnclipu.wi v28, v28, 7
+ vse8.v v4, (a0)
+ vse8.v v12, (t2)
+ vse8.v v20, (t5)
+ vse8.v v28, (a6)
+ addi a2, a2, 128*8
+ addi a3, a3, 128*8
+ sh2add a0, a1, a0
+.endr
+
+.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
+.rept (\h / 2)
+ vsetvlstatic16 \w, \vlen, 0
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ add t2, a0, a1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vle16.v v16, (t0)
+ vle16.v v24, (t1)
+ vadd.vv v8, v8, v0
+ vadd.vv v24, v24, v16
+ vmax.vx v8, v8, zero
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \w, \vlen, 0
+ vnclipu.wi v8, v8, 7
+ vnclipu.wi v24, v24, 7
+ vse8.v v8, (a0)
+ vse8.v v24, (t2)
+ addi a2, a2, 128*4
+ addi a3, a3, 128*4
+ sh1add a0, a1, a0
+.endr
+
+.else
+.rept \h
+ avg_nx1 \w, \vlen
+ .if \w == 128 && \vlen == 128
+ addi a2, a2, 64*2
+ addi a3, a3, 64*2
+ addi a0, a0, 64
+ avg_nx1 \w, \vlen
+ addi a2, a2, -64*2
+ addi a3, a3, -64*2
+ addi a0, a0, -64
+ .endif
+ addi a2, a2, 128*2
+ addi a3, a3, 128*2
+ add a0, a0, a1
+.endr
+.endif
+.endm
+
+.macro w_avg_nx1 w vlen
+ vsetvlstatic16 \w, \vlen, 1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vwmul.vx v16, v0, a7
+ vwmacc.vx v16, t3, v8
+ vsetvlstatic32 \w, \vlen
+ vadd.vx v16, v16, t4
+ vsetvlstatic16 \w, \vlen, 1
+ vnsrl.wx v16, v16, t6
+ vmax.vx v16, v16, zero
+ vsetvlstatic8 \w, \vlen, 1
+ vnclipu.wi v16, v16, 0
+ vse8.v v16, (a0)
+.endm
+
+#if (__riscv_xlen == 64)
+.macro w_avg w h vlen
+ csrw vxrm, zero
+ addi t6, a6, 7
+ ld t3, (sp)
+ ld t4, 8(sp)
+ ld t5, 16(sp)
+ add t4, t4, t5
+ addi t4, t4, 1 // o0 + o1 + 1
+ addi t5, t6, -1 // shift - 1
+ sll t4, t4, t5
+
+.if \w <= (\vlen / 8)
+ .rept (\h / 2)
+ vsetvlstatic16 \w, \vlen, 1
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ add t2, a0, a1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vle16.v v20, (t0)
+ vle16.v v24, (t1)
+ vwmul.vx v16, v0, a7
+ vwmul.vx v28, v20, a7
+ vwmacc.vx v16, t3, v8
+ vwmacc.vx v28, t3, v24
+ vsetvlstatic32 \w, \vlen
+ vadd.vx v16, v16, t4
+ vadd.vx v28, v28, t4
+ vsetvlstatic16 \w, \vlen, 1
+ vnsrl.wx v16, v16, t6
+ vnsrl.wx v28, v28, t6
+ vmax.vx v16, v16, zero
+ vmax.vx v28, v28, zero
+ vsetvlstatic8 \w, \vlen, 1
+ vnclipu.wi v16, v16, 0
+ vnclipu.wi v28, v28, 0
+ vse8.v v16, (a0)
+ vse8.v v28, (t2)
+ addi a2, a2, 128*4
+ addi a3, a3, 128*4
+ sh1add a0, a1, a0
+ .endr
+.else
+ .rept \h
+ w_avg_nx1 \w, \vlen
+ .if \w == (\vlen / 2)
+ addi a2, a2, (\vlen / 2)
+ addi a3, a3, (\vlen / 2)
+ addi a0, a0, (\vlen / 4)
+ w_avg_nx1 \w, \vlen
+ addi a2, a2, -(\vlen / 2)
+ addi a3, a3, -(\vlen / 2)
+ addi a0, a0, -(\vlen / 4)
+ .elseif \w == 128 && \vlen == 128
+ .rept 3
+ addi a2, a2, (\vlen / 2)
+ addi a3, a3, (\vlen / 2)
+ addi a0, a0, (\vlen / 4)
+ w_avg_nx1 \w, \vlen
+ .endr
+ addi a2, a2, -(\vlen / 2) * 3
+ addi a3, a3, -(\vlen / 2) * 3
+ addi a0, a0, -(\vlen / 4) * 3
+ .endif
+
+ addi a2, a2, 128*2
+ addi a3, a3, 128*2
+ add a0, a0, a1
+ .endr
+.endif
+.endm
+#endif
+
+.macro func_avg name vlen
+func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
+.irp w,2,4,8,16,32,64,128
+ li t3, \w
+ bne a4, t3, \name\vlen\()end\w
+.irp h,2,4,8,16,32,64,128
+ li t4, \h
+ bne a5, t4, \name\vlen\()end\w\h
+ \name \w \h \vlen
+ ret
+\name\vlen\()end\w\h:
+.endr
+\name\vlen\()end\w:
+.endr
+endfunc
+.endm
+
+func_avg avg 256
+func_avg avg 128
+#if (__riscv_xlen == 64)
+func_avg w_avg 256
+func_avg w_avg 128
+#endif
diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
new file mode 100644
index 0000000000..d26b4c1c4a
--- /dev/null
+++ b/libavcodec/riscv/vvcdsp_init.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vvc/dsp.h"
+
+#define bf(fn, bd, opt) fn##_##bd##_##opt
+
+#define AVG_PROTOTYPES(bd, opt) \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height); \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height, \
+ int denom, int w0, int w1, int o0, int o1);
+
+AVG_PROTOTYPES(8, rvv_128)
+AVG_PROTOTYPES(8, rvv_256)
+
+#define AVG_INIT(bd, opt) do { \
+ c->inter.avg = bf(ff_vvc_avg, bd, opt); \
+ c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \
+} while (0)
+
+void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
+{
+#if HAVE_RVV
+ const int flags = av_get_cpu_flags();
+
+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ ff_rv_vlen_least(256)) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_256;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
+# endif
+ break;
+ default:
+ break;
+ }
+ } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ ff_rv_vlen_least(128)) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_128;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
+# endif
+ break;
+ default:
+ break;
+ }
+ }
+#endif
+}
diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
index 41e830a98a..c55a37d255 100644
--- a/libavcodec/vvc/dsp.c
+++ b/libavcodec/vvc/dsp.c
@@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
break;
}
-#if ARCH_X86
+#if ARCH_RISCV
+ ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
+#elif ARCH_X86
ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
#endif
}
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 9810ac314c..dcb978549f 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -167,6 +167,7 @@ typedef struct VVCDSPContext {
void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
+void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
#endif /* AVCODEC_VVC_DSP_H */
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-05-21 7:37 [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg uk7b
@ 2024-05-21 7:38 ` flow gg
2024-05-21 7:47 ` uk7b
2024-05-21 16:03 ` Rémi Denis-Courmont
2 siblings, 0 replies; 19+ messages in thread
From: flow gg @ 2024-05-21 7:38 UTC (permalink / raw)
To: FFmpeg development discussions and patches
To reproduce these test results, you need to comment out the if (w == h)
check in tests/checkasm/vvc_mc.c.
Because vset has to be reconfigured inside the loop, I manually wrote a
somewhat cumbersome vset macro.
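For context, a rough standalone sketch (hypothetical, not the actual FFmpeg
test code; the program structure and loop bounds are assumptions, only the
if (w == h) guard and the 2..128 size set come from the note and the tables
above). It lists which of the benchmarked w x h sizes that guard would skip:

    #include <stdio.h>

    /* Standalone sketch, not FFmpeg source: enumerate the block sizes from
     * the tables above and show that, with the "if (w == h)" guard left in
     * tests/checkasm/vvc_mc.c, only the square sizes get benchmarked. */
    int main(void)
    {
        for (int h = 2; h <= 128; h *= 2)
            for (int w = 2; w <= 128; w *= 2)
                printf("%3dx%-3d : %s\n", w, h,
                       w == h ? "covered with the guard"
                              : "requires commenting the guard out");
        return 0;
    }

Of the 49 combinations it prints, only 7 are square, which is why the guard
has to be removed to produce the full tables above.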
<uk7b@foxmail.com> wrote on Tue, 21 May 2024 at 15:38:
> [full patch from the previous message quoted here; quote trimmed]
* [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-05-21 7:37 [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg uk7b
2024-05-21 7:38 ` flow gg
@ 2024-05-21 7:47 ` uk7b
2024-05-21 7:48 ` flow gg
2024-05-21 16:03 ` Rémi Denis-Courmont
2 siblings, 1 reply; 19+ messages in thread
From: uk7b @ 2024-05-21 7:47 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
avg_8_2x2_c : 1.0 1.0
avg_8_2x2_rvv_i32 : 0.7 0.7
avg_8_2x4_c : 2.0 2.0
avg_8_2x4_rvv_i32 : 1.0 0.7
avg_8_2x8_c : 4.0 3.7
avg_8_2x8_rvv_i32 : 1.5 1.2
avg_8_2x16_c : 7.5 7.7
avg_8_2x16_rvv_i32 : 2.7 2.5
avg_8_2x32_c : 14.2 15.0
avg_8_2x32_rvv_i32 : 5.0 4.5
avg_8_2x64_c : 28.5 30.2
avg_8_2x64_rvv_i32 : 9.5 8.7
avg_8_2x128_c : 80.0 70.5
avg_8_2x128_rvv_i32 : 50.7 41.2
avg_8_4x2_c : 1.7 2.0
avg_8_4x2_rvv_i32 : 0.7 0.7
avg_8_4x4_c : 3.5 3.7
avg_8_4x4_rvv_i32 : 1.2 1.0
avg_8_4x8_c : 6.7 7.0
avg_8_4x8_rvv_i32 : 1.5 1.2
avg_8_4x16_c : 13.2 14.0
avg_8_4x16_rvv_i32 : 2.7 2.5
avg_8_4x32_c : 26.2 27.7
avg_8_4x32_rvv_i32 : 5.0 4.5
avg_8_4x64_c : 52.2 55.0
avg_8_4x64_rvv_i32 : 9.5 8.7
avg_8_4x128_c : 146.0 117.5
avg_8_4x128_rvv_i32 : 53.2 40.5
avg_8_8x2_c : 3.5 3.5
avg_8_8x2_rvv_i32 : 0.7 0.7
avg_8_8x4_c : 6.5 6.5
avg_8_8x4_rvv_i32 : 1.2 1.0
avg_8_8x8_c : 12.7 13.2
avg_8_8x8_rvv_i32 : 2.0 1.5
avg_8_8x16_c : 25.2 26.2
avg_8_8x16_rvv_i32 : 3.5 2.5
avg_8_8x32_c : 50.0 52.7
avg_8_8x32_rvv_i32 : 6.5 4.7
avg_8_8x64_c : 99.7 105.0
avg_8_8x64_rvv_i32 : 12.5 8.5
avg_8_8x128_c : 225.7 218.0
avg_8_8x128_rvv_i32 : 78.0 39.2
avg_8_16x2_c : 6.2 6.7
avg_8_16x2_rvv_i32 : 1.2 0.7
avg_8_16x4_c : 12.2 12.7
avg_8_16x4_rvv_i32 : 2.0 1.2
avg_8_16x8_c : 24.7 26.0
avg_8_16x8_rvv_i32 : 3.5 1.7
avg_8_16x16_c : 49.0 51.5
avg_8_16x16_rvv_i32 : 6.2 3.2
avg_8_16x32_c : 97.5 102.5
avg_8_16x32_rvv_i32 : 11.5 5.7
avg_8_16x64_c : 212.5 204.7
avg_8_16x64_rvv_i32 : 22.5 11.0
avg_8_16x128_c : 411.2 418.2
avg_8_16x128_rvv_i32 : 76.0 47.7
avg_8_32x2_c : 12.2 12.7
avg_8_32x2_rvv_i32 : 2.0 1.2
avg_8_32x4_c : 24.2 25.5
avg_8_32x4_rvv_i32 : 3.2 1.7
avg_8_32x8_c : 48.5 50.7
avg_8_32x8_rvv_i32 : 5.7 3.2
avg_8_32x16_c : 96.5 101.2
avg_8_32x16_rvv_i32 : 10.7 5.7
avg_8_32x32_c : 192.5 202.5
avg_8_32x32_rvv_i32 : 20.7 10.5
avg_8_32x64_c : 411.2 404.5
avg_8_32x64_rvv_i32 : 41.0 20.5
avg_8_32x128_c : 834.7 855.2
avg_8_32x128_rvv_i32 : 151.2 118.7
avg_8_64x2_c : 24.0 25.2
avg_8_64x2_rvv_i32 : 3.2 1.7
avg_8_64x4_c : 48.2 50.5
avg_8_64x4_rvv_i32 : 5.2 3.0
avg_8_64x8_c : 95.7 100.7
avg_8_64x8_rvv_i32 : 10.0 5.2
avg_8_64x16_c : 191.7 201.2
avg_8_64x16_rvv_i32 : 19.2 9.5
avg_8_64x32_c : 406.2 402.0
avg_8_64x32_rvv_i32 : 38.0 18.5
avg_8_64x64_c : 827.5 833.7
avg_8_64x64_rvv_i32 : 148.2 95.2
avg_8_64x128_c : 1607.7 1625.7
avg_8_64x128_rvv_i32 : 252.0 179.5
avg_8_128x2_c : 48.7 51.0
avg_8_128x2_rvv_i32 : 5.5 2.7
avg_8_128x4_c : 96.7 101.2
avg_8_128x4_rvv_i32 : 9.7 5.0
avg_8_128x8_c : 192.5 202.0
avg_8_128x8_rvv_i32 : 19.0 9.0
avg_8_128x16_c : 403.5 403.2
avg_8_128x16_rvv_i32 : 37.0 17.5
avg_8_128x32_c : 787.0 805.7
avg_8_128x32_rvv_i32 : 73.5 34.2
avg_8_128x64_c : 1635.7 1654.7
avg_8_128x64_rvv_i32 : 229.5 68.5
avg_8_128x128_c : 3217.0 3233.5
avg_8_128x128_rvv_i32 : 435.0 321.2
w_avg_8_2x2_c : 1.5 1.5
w_avg_8_2x2_rvv_i32 : 1.2 1.2
w_avg_8_2x4_c : 2.7 2.5
w_avg_8_2x4_rvv_i32 : 1.7 1.7
w_avg_8_2x8_c : 5.0 4.7
w_avg_8_2x8_rvv_i32 : 2.7 2.5
w_avg_8_2x16_c : 9.7 9.5
w_avg_8_2x16_rvv_i32 : 4.7 4.5
w_avg_8_2x32_c : 19.0 18.5
w_avg_8_2x32_rvv_i32 : 9.0 8.0
w_avg_8_2x64_c : 37.2 37.0
w_avg_8_2x64_rvv_i32 : 17.5 15.5
w_avg_8_2x128_c : 120.7 82.7
w_avg_8_2x128_rvv_i32 : 71.2 49.0
w_avg_8_4x2_c : 2.5 2.5
w_avg_8_4x2_rvv_i32 : 1.2 1.2
w_avg_8_4x4_c : 4.7 4.5
w_avg_8_4x4_rvv_i32 : 1.7 1.5
w_avg_8_4x8_c : 9.0 9.0
w_avg_8_4x8_rvv_i32 : 2.7 2.5
w_avg_8_4x16_c : 17.7 17.7
w_avg_8_4x16_rvv_i32 : 5.0 4.2
w_avg_8_4x32_c : 34.7 34.7
w_avg_8_4x32_rvv_i32 : 9.0 8.0
w_avg_8_4x64_c : 69.7 69.5
w_avg_8_4x64_rvv_i32 : 17.2 15.5
w_avg_8_4x128_c : 171.7 154.7
w_avg_8_4x128_rvv_i32 : 87.0 48.0
w_avg_8_8x2_c : 4.5 4.5
w_avg_8_8x2_rvv_i32 : 1.5 1.2
w_avg_8_8x4_c : 8.7 8.7
w_avg_8_8x4_rvv_i32 : 2.0 1.7
w_avg_8_8x8_c : 17.2 17.0
w_avg_8_8x8_rvv_i32 : 3.5 2.5
w_avg_8_8x16_c : 34.0 34.0
w_avg_8_8x16_rvv_i32 : 6.0 4.5
w_avg_8_8x32_c : 67.5 68.0
w_avg_8_8x32_rvv_i32 : 10.7 8.2
w_avg_8_8x64_c : 135.7 135.0
w_avg_8_8x64_rvv_i32 : 21.0 15.7
w_avg_8_8x128_c : 304.0 280.0
w_avg_8_8x128_rvv_i32 : 65.5 56.7
w_avg_8_16x2_c : 8.5 8.7
w_avg_8_16x2_rvv_i32 : 2.0 1.2
w_avg_8_16x4_c : 16.7 17.0
w_avg_8_16x4_rvv_i32 : 3.2 2.0
w_avg_8_16x8_c : 33.5 33.5
w_avg_8_16x8_rvv_i32 : 5.7 3.0
w_avg_8_16x16_c : 66.7 62.2
w_avg_8_16x16_rvv_i32 : 27.0 5.2
w_avg_8_16x32_c : 132.5 133.0
w_avg_8_16x32_rvv_i32 : 20.2 9.7
w_avg_8_16x64_c : 264.2 239.0
w_avg_8_16x64_rvv_i32 : 39.7 18.7
w_avg_8_16x128_c : 572.5 541.2
w_avg_8_16x128_rvv_i32 : 148.5 55.2
w_avg_8_32x2_c : 16.7 16.7
w_avg_8_32x2_rvv_i32 : 3.2 2.0
w_avg_8_32x4_c : 33.2 33.2
w_avg_8_32x4_rvv_i32 : 6.0 3.0
w_avg_8_32x8_c : 66.0 66.0
w_avg_8_32x8_rvv_i32 : 11.0 5.5
w_avg_8_32x16_c : 131.2 122.7
w_avg_8_32x16_rvv_i32 : 21.5 9.7
w_avg_8_32x32_c : 262.2 268.7
w_avg_8_32x32_rvv_i32 : 42.2 18.5
w_avg_8_32x64_c : 544.2 547.0
w_avg_8_32x64_rvv_i32 : 83.5 37.0
w_avg_8_32x128_c : 1426.7 1139.7
w_avg_8_32x128_rvv_i32 : 201.0 138.2
w_avg_8_64x2_c : 33.0 33.0
w_avg_8_64x2_rvv_i32 : 6.0 3.0
w_avg_8_64x4_c : 65.7 65.7
w_avg_8_64x4_rvv_i32 : 11.2 5.5
w_avg_8_64x8_c : 131.0 131.5
w_avg_8_64x8_rvv_i32 : 21.5 10.0
w_avg_8_64x16_c : 289.2 262.7
w_avg_8_64x16_rvv_i32 : 42.5 19.2
w_avg_8_64x32_c : 548.7 525.2
w_avg_8_64x32_rvv_i32 : 83.7 37.5
w_avg_8_64x64_c : 1139.5 1208.2
w_avg_8_64x64_rvv_i32 : 209.0 107.5
w_avg_8_64x128_c : 2495.5 2300.5
w_avg_8_64x128_rvv_i32 : 420.2 208.7
w_avg_8_128x2_c : 66.0 66.5
w_avg_8_128x2_rvv_i32 : 11.2 5.5
w_avg_8_128x4_c : 131.2 132.5
w_avg_8_128x4_rvv_i32 : 21.5 10.0
w_avg_8_128x8_c : 280.2 275.7
w_avg_8_128x8_rvv_i32 : 42.2 19.5
w_avg_8_128x16_c : 549.0 527.7
w_avg_8_128x16_rvv_i32 : 104.7 37.7
w_avg_8_128x32_c : 1215.2 1068.5
w_avg_8_128x32_rvv_i32 : 189.0 74.7
w_avg_8_128x64_c : 2305.5 2145.5
w_avg_8_128x64_rvv_i32 : 386.7 190.0
w_avg_8_128x128_c : 5797.0 4600.2
w_avg_8_128x128_rvv_i32 : 760.5 343.0
---
libavcodec/riscv/Makefile | 2 +
libavcodec/riscv/vvc_mc_rvv.S | 312 +++++++++++++++++++++++++++++++++
libavcodec/riscv/vvcdsp_init.c | 71 ++++++++
libavcodec/vvc/dsp.c | 4 +-
libavcodec/vvc/dsp.h | 1 +
5 files changed, 389 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
create mode 100644 libavcodec/riscv/vvcdsp_init.c
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 27b268ae39..6297664fc9 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
+OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
new file mode 100644
index 0000000000..26a6afba1f
--- /dev/null
+++ b/libavcodec/riscv/vvc_mc_rvv.S
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vsetvlstatic8 w vlen is_w
+ .if \w <= 2
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w <= 4 && \vlen == 128
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w <= 4 && \vlen >= 256
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w <= 8 && \vlen == 128
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w <= 8 && \vlen >= 256
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w <= 16 && \vlen == 128
+ vsetivli zero, \w, e8, m1, ta, ma
+ .elseif \w <= 16 && \vlen >= 256
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w <= 32 && \vlen >= 256
+ li t0, \w
+ vsetvli zero, t0, e8, m1, ta, ma
+ .elseif \w <= (\vlen / 4) || \is_w
+ li t0, 64
+ vsetvli zero, t0, e8, m2, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e8, m4, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic16 w vlen is_w
+ .if \w <= 2
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w <= 4 && \vlen == 128
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w <= 4 && \vlen >= 256
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w <= 8 && \vlen == 128
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w <= 8 && \vlen >= 256
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w <= 16 && \vlen == 128
+ vsetivli zero, \w, e16, m2, ta, ma
+ .elseif \w <= 16 && \vlen >= 256
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w <= 32 && \vlen >= 256
+ li t0, \w
+ vsetvli zero, t0, e16, m2, ta, ma
+ .elseif \w <= (\vlen / 4) || \is_w
+ li t0, 64
+ vsetvli zero, t0, e16, m4, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e16, m8, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic32 w vlen
+ .if \w <= 2
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w <= 4 && \vlen == 128
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w <= 4 && \vlen >= 256
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w <= 8 && \vlen == 128
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w <= 8 && \vlen >= 256
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w <= 16 && \vlen == 128
+ vsetivli zero, \w, e32, m4, ta, ma
+ .elseif \w <= 16 && \vlen >= 256
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w <= 32 && \vlen >= 256
+ li t0, \w
+ vsetvli zero, t0, e32, m4, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e32, m8, ta, ma
+ .endif
+.endm
+
+.macro avg_nx1 w vlen
+ vsetvlstatic16 \w, \vlen, 0
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vadd.vv v8, v8, v0
+ vmax.vx v8, v8, zero
+ vsetvlstatic8 \w, \vlen, 0
+ vnclipu.wi v8, v8, 7
+ vse8.v v8, (a0)
+.endm
+
+.macro avg w h vlen
+ csrw vxrm, zero
+
+.if \w <= (\vlen / 4) && \h >= 4
+.rept (\h / 4)
+ vsetvlstatic16 \w, \vlen, 0
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ addi t3, a2, 128*2*2
+ addi t4, a3, 128*2*2
+ addi a7, a3, 128*2*3
+ addi t6, a2, 128*2*3
+ add t2, a0, a1
+ sh1add t5, a1, a0
+ add a6, t5, a1
+ vle16.v v0, (a2)
+ vle16.v v4, (a3)
+ vle16.v v8, (t0)
+ vle16.v v12, (t1)
+ vle16.v v16, (t3)
+ vle16.v v20, (t4)
+ vle16.v v24, (t6)
+ vle16.v v28, (a7)
+ vadd.vv v4, v4, v0
+ vadd.vv v12, v12, v8
+ vadd.vv v20, v20, v16
+ vadd.vv v28, v28, v24
+ vmax.vx v4, v4, zero
+ vmax.vx v12, v12, zero
+ vmax.vx v20, v20, zero
+ vmax.vx v28, v28, zero
+ vsetvlstatic8 \w, \vlen, 0
+ vnclipu.wi v4, v4, 7
+ vnclipu.wi v12, v12, 7
+ vnclipu.wi v20, v20, 7
+ vnclipu.wi v28, v28, 7
+ vse8.v v4, (a0)
+ vse8.v v12, (t2)
+ vse8.v v20, (t5)
+ vse8.v v28, (a6)
+ addi a2, a2, 128*8
+ addi a3, a3, 128*8
+ sh2add a0, a1, a0
+.endr
+
+.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
+.rept (\h / 2)
+ vsetvlstatic16 \w, \vlen, 0
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ add t2, a0, a1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vle16.v v16, (t0)
+ vle16.v v24, (t1)
+ vadd.vv v8, v8, v0
+ vadd.vv v24, v24, v16
+ vmax.vx v8, v8, zero
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \w, \vlen, 0
+ vnclipu.wi v8, v8, 7
+ vnclipu.wi v24, v24, 7
+ vse8.v v8, (a0)
+ vse8.v v24, (t2)
+ addi a2, a2, 128*4
+ addi a3, a3, 128*4
+ sh1add a0, a1, a0
+.endr
+
+.else
+.rept \h
+ avg_nx1 \w, \vlen
+ .if \w == 128 && \vlen == 128
+ addi a2, a2, 64*2
+ addi a3, a3, 64*2
+ addi a0, a0, 64
+ avg_nx1 \w, \vlen
+ addi a2, a2, -64*2
+ addi a3, a3, -64*2
+ addi a0, a0, -64
+ .endif
+ addi a2, a2, 128*2
+ addi a3, a3, 128*2
+ add a0, a0, a1
+.endr
+.endif
+.endm
+
+.macro w_avg_nx1 w vlen
+ vsetvlstatic16 \w, \vlen, 1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vwmul.vx v16, v0, a7
+ vwmacc.vx v16, t3, v8
+ vsetvlstatic32 \w, \vlen
+ vadd.vx v16, v16, t4
+ vsetvlstatic16 \w, \vlen, 1
+ vnsrl.wx v16, v16, t6
+ vmax.vx v16, v16, zero
+ vsetvlstatic8 \w, \vlen, 1
+ vnclipu.wi v16, v16, 0
+ vse8.v v16, (a0)
+.endm
+
+#if (__riscv_xlen == 64)
+.macro w_avg w h vlen
+ csrw vxrm, zero
+ addi t6, a6, 7
+ ld t3, (sp)
+ ld t4, 8(sp)
+ ld t5, 16(sp)
+ add t4, t4, t5
+ addi t4, t4, 1 // o0 + o1 + 1
+ addi t5, t6, -1 // shift - 1
+ sll t4, t4, t5
+
+.if \w <= (\vlen / 8)
+ .rept (\h / 2)
+ vsetvlstatic16 \w, \vlen, 1
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ add t2, a0, a1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vle16.v v20, (t0)
+ vle16.v v24, (t1)
+ vwmul.vx v16, v0, a7
+ vwmul.vx v28, v20, a7
+ vwmacc.vx v16, t3, v8
+ vwmacc.vx v28, t3, v24
+ vsetvlstatic32 \w, \vlen
+ vadd.vx v16, v16, t4
+ vadd.vx v28, v28, t4
+ vsetvlstatic16 \w, \vlen, 1
+ vnsrl.wx v16, v16, t6
+ vnsrl.wx v28, v28, t6
+ vmax.vx v16, v16, zero
+ vmax.vx v28, v28, zero
+ vsetvlstatic8 \w, \vlen, 1
+ vnclipu.wi v16, v16, 0
+ vnclipu.wi v28, v28, 0
+ vse8.v v16, (a0)
+ vse8.v v28, (t2)
+ addi a2, a2, 128*4
+ addi a3, a3, 128*4
+ sh1add a0, a1, a0
+ .endr
+.else
+ .rept \h
+ w_avg_nx1 \w, \vlen
+ .if \w == (\vlen / 2)
+ addi a2, a2, (\vlen / 2)
+ addi a3, a3, (\vlen / 2)
+ addi a0, a0, (\vlen / 4)
+ w_avg_nx1 \w, \vlen
+ addi a2, a2, -(\vlen / 2)
+ addi a3, a3, -(\vlen / 2)
+ addi a0, a0, -(\vlen / 4)
+ .elseif \w == 128 && \vlen == 128
+ .rept 3
+ addi a2, a2, (\vlen / 2)
+ addi a3, a3, (\vlen / 2)
+ addi a0, a0, (\vlen / 4)
+ w_avg_nx1 \w, \vlen
+ .endr
+ addi a2, a2, -(\vlen / 2) * 3
+ addi a3, a3, -(\vlen / 2) * 3
+ addi a0, a0, -(\vlen / 4) * 3
+ .endif
+
+ addi a2, a2, 128*2
+ addi a3, a3, 128*2
+ add a0, a0, a1
+ .endr
+.endif
+.endm
+#endif
+
+.macro func_avg name vlen
+func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
+.irp w,2,4,8,16,32,64,128
+ li t3, \w
+ bne a4, t3, \name\vlen\()end\w
+.irp h,2,4,8,16,32,64,128
+ li t4, \h
+ bne a5, t4, \name\vlen\()end\w\h
+ \name \w \h \vlen
+ ret
+\name\vlen\()end\w\h:
+.endr
+\name\vlen\()end\w:
+.endr
+endfunc
+.endm
+
+func_avg avg 256
+func_avg avg 128
+#if (__riscv_xlen == 64)
+func_avg w_avg 256
+func_avg w_avg 128
+#endif
diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
new file mode 100644
index 0000000000..85b1ede061
--- /dev/null
+++ b/libavcodec/riscv/vvcdsp_init.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vvc/dsp.h"
+
+#define bf(fn, bd, opt) fn##_##bd##_##opt
+
+#define AVG_PROTOTYPES(bd, opt) \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height); \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height, \
+ int denom, int w0, int w1, int o0, int o1);
+
+AVG_PROTOTYPES(8, rvv_128)
+AVG_PROTOTYPES(8, rvv_256)
+
+void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
+{
+#if HAVE_RVV
+ const int flags = av_get_cpu_flags();
+
+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ ff_rv_vlen_least(256)) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_256;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
+# endif
+ break;
+ default:
+ break;
+ }
+ } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ ff_rv_vlen_least(128)) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_128;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
+# endif
+ break;
+ default:
+ break;
+ }
+ }
+#endif
+}
diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
index 41e830a98a..c55a37d255 100644
--- a/libavcodec/vvc/dsp.c
+++ b/libavcodec/vvc/dsp.c
@@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
break;
}
-#if ARCH_X86
+#if ARCH_RISCV
+ ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
+#elif ARCH_X86
ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
#endif
}
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 9810ac314c..dcb978549f 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -167,6 +167,7 @@ typedef struct VVCDSPContext {
void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
+void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
#endif /* AVCODEC_VVC_DSP_H */
--
2.45.1
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-05-21 7:47 ` uk7b
@ 2024-05-21 7:48 ` flow gg
0 siblings, 0 replies; 19+ messages in thread
From: flow gg @ 2024-05-21 7:48 UTC (permalink / raw)
To: FFmpeg development discussions and patches
There were three unused lines that I forgot to delete before submitting.
I have updated the patch here.
<uk7b@foxmail.com> wrote on Tue, 21 May 2024 at 15:47:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
> avg_8_2x2_c : 1.0 1.0
> avg_8_2x2_rvv_i32 : 0.7 0.7
> avg_8_2x4_c : 2.0 2.0
> avg_8_2x4_rvv_i32 : 1.0 0.7
> avg_8_2x8_c : 4.0 3.7
> avg_8_2x8_rvv_i32 : 1.5 1.2
> avg_8_2x16_c : 7.5 7.7
> avg_8_2x16_rvv_i32 : 2.7 2.5
> avg_8_2x32_c : 14.2 15.0
> avg_8_2x32_rvv_i32 : 5.0 4.5
> avg_8_2x64_c : 28.5 30.2
> avg_8_2x64_rvv_i32 : 9.5 8.7
> avg_8_2x128_c : 80.0 70.5
> avg_8_2x128_rvv_i32 : 50.7 41.2
> avg_8_4x2_c : 1.7 2.0
> avg_8_4x2_rvv_i32 : 0.7 0.7
> avg_8_4x4_c : 3.5 3.7
> avg_8_4x4_rvv_i32 : 1.2 1.0
> avg_8_4x8_c : 6.7 7.0
> avg_8_4x8_rvv_i32 : 1.5 1.2
> avg_8_4x16_c : 13.2 14.0
> avg_8_4x16_rvv_i32 : 2.7 2.5
> avg_8_4x32_c : 26.2 27.7
> avg_8_4x32_rvv_i32 : 5.0 4.5
> avg_8_4x64_c : 52.2 55.0
> avg_8_4x64_rvv_i32 : 9.5 8.7
> avg_8_4x128_c : 146.0 117.5
> avg_8_4x128_rvv_i32 : 53.2 40.5
> avg_8_8x2_c : 3.5 3.5
> avg_8_8x2_rvv_i32 : 0.7 0.7
> avg_8_8x4_c : 6.5 6.5
> avg_8_8x4_rvv_i32 : 1.2 1.0
> avg_8_8x8_c : 12.7 13.2
> avg_8_8x8_rvv_i32 : 2.0 1.5
> avg_8_8x16_c : 25.2 26.2
> avg_8_8x16_rvv_i32 : 3.5 2.5
> avg_8_8x32_c : 50.0 52.7
> avg_8_8x32_rvv_i32 : 6.5 4.7
> avg_8_8x64_c : 99.7 105.0
> avg_8_8x64_rvv_i32 : 12.5 8.5
> avg_8_8x128_c : 225.7 218.0
> avg_8_8x128_rvv_i32 : 78.0 39.2
> avg_8_16x2_c : 6.2 6.7
> avg_8_16x2_rvv_i32 : 1.2 0.7
> avg_8_16x4_c : 12.2 12.7
> avg_8_16x4_rvv_i32 : 2.0 1.2
> avg_8_16x8_c : 24.7 26.0
> avg_8_16x8_rvv_i32 : 3.5 1.7
> avg_8_16x16_c : 49.0 51.5
> avg_8_16x16_rvv_i32 : 6.2 3.2
> avg_8_16x32_c : 97.5 102.5
> avg_8_16x32_rvv_i32 : 11.5 5.7
> avg_8_16x64_c : 212.5 204.7
> avg_8_16x64_rvv_i32 : 22.5 11.0
> avg_8_16x128_c : 411.2 418.2
> avg_8_16x128_rvv_i32 : 76.0 47.7
> avg_8_32x2_c : 12.2 12.7
> avg_8_32x2_rvv_i32 : 2.0 1.2
> avg_8_32x4_c : 24.2 25.5
> avg_8_32x4_rvv_i32 : 3.2 1.7
> avg_8_32x8_c : 48.5 50.7
> avg_8_32x8_rvv_i32 : 5.7 3.2
> avg_8_32x16_c : 96.5 101.2
> avg_8_32x16_rvv_i32 : 10.7 5.7
> avg_8_32x32_c : 192.5 202.5
> avg_8_32x32_rvv_i32 : 20.7 10.5
> avg_8_32x64_c : 411.2 404.5
> avg_8_32x64_rvv_i32 : 41.0 20.5
> avg_8_32x128_c : 834.7 855.2
> avg_8_32x128_rvv_i32 : 151.2 118.7
> avg_8_64x2_c : 24.0 25.2
> avg_8_64x2_rvv_i32 : 3.2 1.7
> avg_8_64x4_c : 48.2 50.5
> avg_8_64x4_rvv_i32 : 5.2 3.0
> avg_8_64x8_c : 95.7 100.7
> avg_8_64x8_rvv_i32 : 10.0 5.2
> avg_8_64x16_c : 191.7 201.2
> avg_8_64x16_rvv_i32 : 19.2 9.5
> avg_8_64x32_c : 406.2 402.0
> avg_8_64x32_rvv_i32 : 38.0 18.5
> avg_8_64x64_c : 827.5 833.7
> avg_8_64x64_rvv_i32 : 148.2 95.2
> avg_8_64x128_c : 1607.7 1625.7
> avg_8_64x128_rvv_i32 : 252.0 179.5
> avg_8_128x2_c : 48.7 51.0
> avg_8_128x2_rvv_i32 : 5.5 2.7
> avg_8_128x4_c : 96.7 101.2
> avg_8_128x4_rvv_i32 : 9.7 5.0
> avg_8_128x8_c : 192.5 202.0
> avg_8_128x8_rvv_i32 : 19.0 9.0
> avg_8_128x16_c : 403.5 403.2
> avg_8_128x16_rvv_i32 : 37.0 17.5
> avg_8_128x32_c : 787.0 805.7
> avg_8_128x32_rvv_i32 : 73.5 34.2
> avg_8_128x64_c : 1635.7 1654.7
> avg_8_128x64_rvv_i32 : 229.5 68.5
> avg_8_128x128_c : 3217.0 3233.5
> avg_8_128x128_rvv_i32 : 435.0 321.2
> w_avg_8_2x2_c : 1.5 1.5
> w_avg_8_2x2_rvv_i32 : 1.2 1.2
> w_avg_8_2x4_c : 2.7 2.5
> w_avg_8_2x4_rvv_i32 : 1.7 1.7
> w_avg_8_2x8_c : 5.0 4.7
> w_avg_8_2x8_rvv_i32 : 2.7 2.5
> w_avg_8_2x16_c : 9.7 9.5
> w_avg_8_2x16_rvv_i32 : 4.7 4.5
> w_avg_8_2x32_c : 19.0 18.5
> w_avg_8_2x32_rvv_i32 : 9.0 8.0
> w_avg_8_2x64_c : 37.2 37.0
> w_avg_8_2x64_rvv_i32 : 17.5 15.5
> w_avg_8_2x128_c : 120.7 82.7
> w_avg_8_2x128_rvv_i32 : 71.2 49.0
> w_avg_8_4x2_c : 2.5 2.5
> w_avg_8_4x2_rvv_i32 : 1.2 1.2
> w_avg_8_4x4_c : 4.7 4.5
> w_avg_8_4x4_rvv_i32 : 1.7 1.5
> w_avg_8_4x8_c : 9.0 9.0
> w_avg_8_4x8_rvv_i32 : 2.7 2.5
> w_avg_8_4x16_c : 17.7 17.7
> w_avg_8_4x16_rvv_i32 : 5.0 4.2
> w_avg_8_4x32_c : 34.7 34.7
> w_avg_8_4x32_rvv_i32 : 9.0 8.0
> w_avg_8_4x64_c : 69.7 69.5
> w_avg_8_4x64_rvv_i32 : 17.2 15.5
> w_avg_8_4x128_c : 171.7 154.7
> w_avg_8_4x128_rvv_i32 : 87.0 48.0
> w_avg_8_8x2_c : 4.5 4.5
> w_avg_8_8x2_rvv_i32 : 1.5 1.2
> w_avg_8_8x4_c : 8.7 8.7
> w_avg_8_8x4_rvv_i32 : 2.0 1.7
> w_avg_8_8x8_c : 17.2 17.0
> w_avg_8_8x8_rvv_i32 : 3.5 2.5
> w_avg_8_8x16_c : 34.0 34.0
> w_avg_8_8x16_rvv_i32 : 6.0 4.5
> w_avg_8_8x32_c : 67.5 68.0
> w_avg_8_8x32_rvv_i32 : 10.7 8.2
> w_avg_8_8x64_c : 135.7 135.0
> w_avg_8_8x64_rvv_i32 : 21.0 15.7
> w_avg_8_8x128_c : 304.0 280.0
> w_avg_8_8x128_rvv_i32 : 65.5 56.7
> w_avg_8_16x2_c : 8.5 8.7
> w_avg_8_16x2_rvv_i32 : 2.0 1.2
> w_avg_8_16x4_c : 16.7 17.0
> w_avg_8_16x4_rvv_i32 : 3.2 2.0
> w_avg_8_16x8_c : 33.5 33.5
> w_avg_8_16x8_rvv_i32 : 5.7 3.0
> w_avg_8_16x16_c : 66.7 62.2
> w_avg_8_16x16_rvv_i32 : 27.0 5.2
> w_avg_8_16x32_c : 132.5 133.0
> w_avg_8_16x32_rvv_i32 : 20.2 9.7
> w_avg_8_16x64_c : 264.2 239.0
> w_avg_8_16x64_rvv_i32 : 39.7 18.7
> w_avg_8_16x128_c : 572.5 541.2
> w_avg_8_16x128_rvv_i32 : 148.5 55.2
> w_avg_8_32x2_c : 16.7 16.7
> w_avg_8_32x2_rvv_i32 : 3.2 2.0
> w_avg_8_32x4_c : 33.2 33.2
> w_avg_8_32x4_rvv_i32 : 6.0 3.0
> w_avg_8_32x8_c : 66.0 66.0
> w_avg_8_32x8_rvv_i32 : 11.0 5.5
> w_avg_8_32x16_c : 131.2 122.7
> w_avg_8_32x16_rvv_i32 : 21.5 9.7
> w_avg_8_32x32_c : 262.2 268.7
> w_avg_8_32x32_rvv_i32 : 42.2 18.5
> w_avg_8_32x64_c : 544.2 547.0
> w_avg_8_32x64_rvv_i32 : 83.5 37.0
> w_avg_8_32x128_c : 1426.7 1139.7
> w_avg_8_32x128_rvv_i32 : 201.0 138.2
> w_avg_8_64x2_c : 33.0 33.0
> w_avg_8_64x2_rvv_i32 : 6.0 3.0
> w_avg_8_64x4_c : 65.7 65.7
> w_avg_8_64x4_rvv_i32 : 11.2 5.5
> w_avg_8_64x8_c : 131.0 131.5
> w_avg_8_64x8_rvv_i32 : 21.5 10.0
> w_avg_8_64x16_c : 289.2 262.7
> w_avg_8_64x16_rvv_i32 : 42.5 19.2
> w_avg_8_64x32_c : 548.7 525.2
> w_avg_8_64x32_rvv_i32 : 83.7 37.5
> w_avg_8_64x64_c : 1139.5 1208.2
> w_avg_8_64x64_rvv_i32 : 209.0 107.5
> w_avg_8_64x128_c : 2495.5 2300.5
> w_avg_8_64x128_rvv_i32 : 420.2 208.7
> w_avg_8_128x2_c : 66.0 66.5
> w_avg_8_128x2_rvv_i32 : 11.2 5.5
> w_avg_8_128x4_c : 131.2 132.5
> w_avg_8_128x4_rvv_i32 : 21.5 10.0
> w_avg_8_128x8_c : 280.2 275.7
> w_avg_8_128x8_rvv_i32 : 42.2 19.5
> w_avg_8_128x16_c : 549.0 527.7
> w_avg_8_128x16_rvv_i32 : 104.7 37.7
> w_avg_8_128x32_c : 1215.2 1068.5
> w_avg_8_128x32_rvv_i32 : 189.0 74.7
> w_avg_8_128x64_c : 2305.5 2145.5
> w_avg_8_128x64_rvv_i32 : 386.7 190.0
> w_avg_8_128x128_c : 5797.0 4600.2
> w_avg_8_128x128_rvv_i32 : 760.5 343.0
> ---
> libavcodec/riscv/Makefile | 2 +
> libavcodec/riscv/vvc_mc_rvv.S | 312 +++++++++++++++++++++++++++++++++
> libavcodec/riscv/vvcdsp_init.c | 71 ++++++++
> libavcodec/vvc/dsp.c | 4 +-
> libavcodec/vvc/dsp.h | 1 +
> 5 files changed, 389 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
> create mode 100644 libavcodec/riscv/vvcdsp_init.c
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 27b268ae39..6297664fc9 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
> RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
> +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
> diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
> new file mode 100644
> index 0000000000..26a6afba1f
> --- /dev/null
> +++ b/libavcodec/riscv/vvc_mc_rvv.S
> @@ -0,0 +1,312 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 w vlen is_w
> + .if \w <= 2
> + vsetivli zero, \w, e8, mf8, ta, ma
> + .elseif \w <= 4 && \vlen == 128
> + vsetivli zero, \w, e8, mf4, ta, ma
> + .elseif \w <= 4 && \vlen >= 256
> + vsetivli zero, \w, e8, mf8, ta, ma
> + .elseif \w <= 8 && \vlen == 128
> + vsetivli zero, \w, e8, mf2, ta, ma
> + .elseif \w <= 8 && \vlen >= 256
> + vsetivli zero, \w, e8, mf4, ta, ma
> + .elseif \w <= 16 && \vlen == 128
> + vsetivli zero, \w, e8, m1, ta, ma
> + .elseif \w <= 16 && \vlen >= 256
> + vsetivli zero, \w, e8, mf2, ta, ma
> + .elseif \w <= 32 && \vlen >= 256
> + li t0, \w
> + vsetvli zero, t0, e8, m1, ta, ma
> + .elseif \w <= (\vlen / 4) || \is_w
> + li t0, 64
> + vsetvli zero, t0, e8, m2, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e8, m4, ta, ma
> + .endif
> +.endm
> +
> +.macro vsetvlstatic16 w vlen is_w
> + .if \w <= 2
> + vsetivli zero, \w, e16, mf4, ta, ma
> + .elseif \w <= 4 && \vlen == 128
> + vsetivli zero, \w, e16, mf2, ta, ma
> + .elseif \w <= 4 && \vlen >= 256
> + vsetivli zero, \w, e16, mf4, ta, ma
> + .elseif \w <= 8 && \vlen == 128
> + vsetivli zero, \w, e16, m1, ta, ma
> + .elseif \w <= 8 && \vlen >= 256
> + vsetivli zero, \w, e16, mf2, ta, ma
> + .elseif \w <= 16 && \vlen == 128
> + vsetivli zero, \w, e16, m2, ta, ma
> + .elseif \w <= 16 && \vlen >= 256
> + vsetivli zero, \w, e16, m1, ta, ma
> + .elseif \w <= 32 && \vlen >= 256
> + li t0, \w
> + vsetvli zero, t0, e16, m2, ta, ma
> + .elseif \w <= (\vlen / 4) || \is_w
> + li t0, 64
> + vsetvli zero, t0, e16, m4, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e16, m8, ta, ma
> + .endif
> +.endm
> +
> +.macro vsetvlstatic32 w vlen
> + .if \w <= 2
> + vsetivli zero, \w, e32, mf2, ta, ma
> + .elseif \w <= 4 && \vlen == 128
> + vsetivli zero, \w, e32, m1, ta, ma
> + .elseif \w <= 4 && \vlen >= 256
> + vsetivli zero, \w, e32, mf2, ta, ma
> + .elseif \w <= 8 && \vlen == 128
> + vsetivli zero, \w, e32, m2, ta, ma
> + .elseif \w <= 8 && \vlen >= 256
> + vsetivli zero, \w, e32, m1, ta, ma
> + .elseif \w <= 16 && \vlen == 128
> + vsetivli zero, \w, e32, m4, ta, ma
> + .elseif \w <= 16 && \vlen >= 256
> + vsetivli zero, \w, e32, m2, ta, ma
> + .elseif \w <= 32 && \vlen >= 256
> + li t0, \w
> + vsetvli zero, t0, e32, m4, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e32, m8, ta, ma
> + .endif
> +.endm
> +
> +.macro avg_nx1 w vlen
> + vsetvlstatic16 \w, \vlen, 0
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vadd.vv v8, v8, v0
> + vmax.vx v8, v8, zero
> + vsetvlstatic8 \w, \vlen, 0
> + vnclipu.wi v8, v8, 7
> + vse8.v v8, (a0)
> +.endm
> +
> +.macro avg w h vlen
> + csrw vxrm, zero
> +
> +.if \w <= (\vlen / 4) && \h >= 4
> +.rept (\h / 4)
> + vsetvlstatic16 \w, \vlen, 0
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + addi t3, a2, 128*2*2
> + addi t4, a3, 128*2*2
> + addi a7, a3, 128*2*3
> + addi t6, a2, 128*2*3
> + add t2, a0, a1
> + sh1add t5, a1, a0
> + add a6, t5, a1
> + vle16.v v0, (a2)
> + vle16.v v4, (a3)
> + vle16.v v8, (t0)
> + vle16.v v12, (t1)
> + vle16.v v16, (t3)
> + vle16.v v20, (t4)
> + vle16.v v24, (t6)
> + vle16.v v28, (a7)
> + vadd.vv v4, v4, v0
> + vadd.vv v12, v12, v8
> + vadd.vv v20, v20, v16
> + vadd.vv v28, v28, v24
> + vmax.vx v4, v4, zero
> + vmax.vx v12, v12, zero
> + vmax.vx v20, v20, zero
> + vmax.vx v28, v28, zero
> + vsetvlstatic8 \w, \vlen, 0
> + vnclipu.wi v4, v4, 7
> + vnclipu.wi v12, v12, 7
> + vnclipu.wi v20, v20, 7
> + vnclipu.wi v28, v28, 7
> + vse8.v v4, (a0)
> + vse8.v v12, (t2)
> + vse8.v v20, (t5)
> + vse8.v v28, (a6)
> + addi a2, a2, 128*8
> + addi a3, a3, 128*8
> + sh2add a0, a1, a0
> +.endr
> +
> +.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
> +.rept (\h / 2)
> + vsetvlstatic16 \w, \vlen, 0
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + add t2, a0, a1
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vle16.v v16, (t0)
> + vle16.v v24, (t1)
> + vadd.vv v8, v8, v0
> + vadd.vv v24, v24, v16
> + vmax.vx v8, v8, zero
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \w, \vlen, 0
> + vnclipu.wi v8, v8, 7
> + vnclipu.wi v24, v24, 7
> + vse8.v v8, (a0)
> + vse8.v v24, (t2)
> + addi a2, a2, 128*4
> + addi a3, a3, 128*4
> + sh1add a0, a1, a0
> +.endr
> +
> +.else
> +.rept \h
> + avg_nx1 \w, \vlen
> + .if \w == 128 && \vlen == 128
> + addi a2, a2, 64*2
> + addi a3, a3, 64*2
> + addi a0, a0, 64
> + avg_nx1 \w, \vlen
> + addi a2, a2, -64*2
> + addi a3, a3, -64*2
> + addi a0, a0, -64
> + .endif
> + addi a2, a2, 128*2
> + addi a3, a3, 128*2
> + add a0, a0, a1
> +.endr
> +.endif
> +.endm
> +
> +.macro w_avg_nx1 w vlen
> + vsetvlstatic16 \w, \vlen, 1
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vwmul.vx v16, v0, a7
> + vwmacc.vx v16, t3, v8
> + vsetvlstatic32 \w, \vlen
> + vadd.vx v16, v16, t4
> + vsetvlstatic16 \w, \vlen, 1
> + vnsrl.wx v16, v16, t6
> + vmax.vx v16, v16, zero
> + vsetvlstatic8 \w, \vlen, 1
> + vnclipu.wi v16, v16, 0
> + vse8.v v16, (a0)
> +.endm
> +
> +#if (__riscv_xlen == 64)
> +.macro w_avg w h vlen
> + csrw vxrm, zero
> + addi t6, a6, 7
> + ld t3, (sp)
> + ld t4, 8(sp)
> + ld t5, 16(sp)
> + add t4, t4, t5
> + addi t4, t4, 1 // o0 + o1 + 1
> + addi t5, t6, -1 // shift - 1
> + sll t4, t4, t5
> +
> +.if \w <= (\vlen / 8)
> + .rept (\h / 2)
> + vsetvlstatic16 \w, \vlen, 1
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + add t2, a0, a1
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vle16.v v20, (t0)
> + vle16.v v24, (t1)
> + vwmul.vx v16, v0, a7
> + vwmul.vx v28, v20, a7
> + vwmacc.vx v16, t3, v8
> + vwmacc.vx v28, t3, v24
> + vsetvlstatic32 \w, \vlen
> + vadd.vx v16, v16, t4
> + vadd.vx v28, v28, t4
> + vsetvlstatic16 \w, \vlen, 1
> + vnsrl.wx v16, v16, t6
> + vnsrl.wx v28, v28, t6
> + vmax.vx v16, v16, zero
> + vmax.vx v28, v28, zero
> + vsetvlstatic8 \w, \vlen, 1
> + vnclipu.wi v16, v16, 0
> + vnclipu.wi v28, v28, 0
> + vse8.v v16, (a0)
> + vse8.v v28, (t2)
> + addi a2, a2, 128*4
> + addi a3, a3, 128*4
> + sh1add a0, a1, a0
> + .endr
> +.else
> + .rept \h
> + w_avg_nx1 \w, \vlen
> + .if \w == (\vlen / 2)
> + addi a2, a2, (\vlen / 2)
> + addi a3, a3, (\vlen / 2)
> + addi a0, a0, (\vlen / 4)
> + w_avg_nx1 \w, \vlen
> + addi a2, a2, -(\vlen / 2)
> + addi a3, a3, -(\vlen / 2)
> + addi a0, a0, -(\vlen / 4)
> + .elseif \w == 128 && \vlen == 128
> + .rept 3
> + addi a2, a2, (\vlen / 2)
> + addi a3, a3, (\vlen / 2)
> + addi a0, a0, (\vlen / 4)
> + w_avg_nx1 \w, \vlen
> + .endr
> + addi a2, a2, -(\vlen / 2) * 3
> + addi a3, a3, -(\vlen / 2) * 3
> + addi a0, a0, -(\vlen / 4) * 3
> + .endif
> +
> + addi a2, a2, 128*2
> + addi a3, a3, 128*2
> + add a0, a0, a1
> + .endr
> +.endif
> +.endm
> +#endif
> +
> +.macro func_avg name vlen
> +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
> +.irp w,2,4,8,16,32,64,128
> + li t3, \w
> + bne a4, t3, \name\vlen\()end\w
> +.irp h,2,4,8,16,32,64,128
> + li t4, \h
> + bne a5, t4, \name\vlen\()end\w\h
> + \name \w \h \vlen
> + ret
> +\name\vlen\()end\w\h:
> +.endr
> +\name\vlen\()end\w:
> +.endr
> +endfunc
> +.endm
> +
> +func_avg avg 256
> +func_avg avg 128
> +#if (__riscv_xlen == 64)
> +func_avg w_avg 256
> +func_avg w_avg 128
> +#endif
> diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
> new file mode 100644
> index 0000000000..85b1ede061
> --- /dev/null
> +++ b/libavcodec/riscv/vvcdsp_init.c
> @@ -0,0 +1,71 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> +#include "libavcodec/vvc/dsp.h"
> +
> +#define bf(fn, bd, opt) fn##_##bd##_##opt
> +
> +#define AVG_PROTOTYPES(bd, opt) \
> +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
> + const int16_t *src0, const int16_t *src1, int width, int height); \
> +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
> + const int16_t *src0, const int16_t *src1, int width, int height, \
> + int denom, int w0, int w1, int o0, int o1);
> +
> +AVG_PROTOTYPES(8, rvv_128)
> +AVG_PROTOTYPES(8, rvv_256)
> +
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> +{
> +#if HAVE_RVV
> + const int flags = av_get_cpu_flags();
> +
> + if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> + ff_rv_vlen_least(256)) {
> + switch (bd) {
> + case 8:
> + c->inter.avg = ff_vvc_avg_8_rvv_256;
> +# if (__riscv_xlen == 64)
> + c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
> +# endif
> + break;
> + default:
> + break;
> + }
> + } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> + ff_rv_vlen_least(128)) {
> + switch (bd) {
> + case 8:
> + c->inter.avg = ff_vvc_avg_8_rvv_128;
> +# if (__riscv_xlen == 64)
> + c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
> +# endif
> + break;
> + default:
> + break;
> + }
> + }
> +#endif
> +}
> diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> index 41e830a98a..c55a37d255 100644
> --- a/libavcodec/vvc/dsp.c
> +++ b/libavcodec/vvc/dsp.c
> @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
> break;
> }
>
> -#if ARCH_X86
> +#if ARCH_RISCV
> + ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> +#elif ARCH_X86
> ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
> #endif
> }
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 9810ac314c..dcb978549f 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -167,6 +167,7 @@ typedef struct VVCDSPContext {
>
> void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
>
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
> void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
>
> #endif /* AVCODEC_VVC_DSP_H */
> --
> 2.45.1
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-05-21 7:37 [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg uk7b
2024-05-21 7:38 ` flow gg
2024-05-21 7:47 ` uk7b
@ 2024-05-21 16:03 ` Rémi Denis-Courmont
2024-05-21 17:24 ` flow gg
2 siblings, 1 reply; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-21 16:03 UTC (permalink / raw)
To: ffmpeg-devel
On Tuesday, 21 May 2024 at 10:37:51 EEST, uk7b@foxmail.com wrote:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> ---
> libavcodec/riscv/Makefile | 2 +
> libavcodec/riscv/vvc_mc_rvv.S | 312 +++++++++++++++++++++++++++++++++
> libavcodec/riscv/vvcdsp_init.c | 76 ++++++++
> libavcodec/vvc/dsp.c | 4 +-
> libavcodec/vvc/dsp.h | 1 +
> 5 files changed, 394 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
> create mode 100644 libavcodec/riscv/vvcdsp_init.c
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 27b268ae39..6297664fc9 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
> RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
> +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
> diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
> new file mode 100644
> index 0000000000..26a6afba1f
> --- /dev/null
> +++ b/libavcodec/riscv/vvc_mc_rvv.S
> @@ -0,0 +1,312 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 w vlen is_w
> + .if \w <= 2
> + vsetivli zero, \w, e8, mf8, ta, ma
> + .elseif \w <= 4 && \vlen == 128
> + vsetivli zero, \w, e8, mf4, ta, ma
> + .elseif \w <= 4 && \vlen >= 256
> + vsetivli zero, \w, e8, mf8, ta, ma
> + .elseif \w <= 8 && \vlen == 128
> + vsetivli zero, \w, e8, mf2, ta, ma
> + .elseif \w <= 8 && \vlen >= 256
> + vsetivli zero, \w, e8, mf4, ta, ma
> + .elseif \w <= 16 && \vlen == 128
> + vsetivli zero, \w, e8, m1, ta, ma
> + .elseif \w <= 16 && \vlen >= 256
> + vsetivli zero, \w, e8, mf2, ta, ma
> + .elseif \w <= 32 && \vlen >= 256
> + li t0, \w
> + vsetvli zero, t0, e8, m1, ta, ma
> + .elseif \w <= (\vlen / 4) || \is_w
> + li t0, 64
> + vsetvli zero, t0, e8, m2, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e8, m4, ta, ma
> + .endif
> +.endm
> +
> +.macro vsetvlstatic16 w vlen is_w
> + .if \w <= 2
> + vsetivli zero, \w, e16, mf4, ta, ma
> + .elseif \w <= 4 && \vlen == 128
> + vsetivli zero, \w, e16, mf2, ta, ma
> + .elseif \w <= 4 && \vlen >= 256
> + vsetivli zero, \w, e16, mf4, ta, ma
> + .elseif \w <= 8 && \vlen == 128
> + vsetivli zero, \w, e16, m1, ta, ma
> + .elseif \w <= 8 && \vlen >= 256
> + vsetivli zero, \w, e16, mf2, ta, ma
> + .elseif \w <= 16 && \vlen == 128
> + vsetivli zero, \w, e16, m2, ta, ma
> + .elseif \w <= 16 && \vlen >= 256
> + vsetivli zero, \w, e16, m1, ta, ma
> + .elseif \w <= 32 && \vlen >= 256
> + li t0, \w
> + vsetvli zero, t0, e16, m2, ta, ma
> + .elseif \w <= (\vlen / 4) || \is_w
> + li t0, 64
> + vsetvli zero, t0, e16, m4, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e16, m8, ta, ma
> + .endif
> +.endm
> +
> +.macro vsetvlstatic32 w vlen
> + .if \w <= 2
> + vsetivli zero, \w, e32, mf2, ta, ma
> + .elseif \w <= 4 && \vlen == 128
> + vsetivli zero, \w, e32, m1, ta, ma
> + .elseif \w <= 4 && \vlen >= 256
> + vsetivli zero, \w, e32, mf2, ta, ma
> + .elseif \w <= 8 && \vlen == 128
> + vsetivli zero, \w, e32, m2, ta, ma
> + .elseif \w <= 8 && \vlen >= 256
> + vsetivli zero, \w, e32, m1, ta, ma
> + .elseif \w <= 16 && \vlen == 128
> + vsetivli zero, \w, e32, m4, ta, ma
> + .elseif \w <= 16 && \vlen >= 256
> + vsetivli zero, \w, e32, m2, ta, ma
> + .elseif \w <= 32 && \vlen >= 256
> + li t0, \w
> + vsetvli zero, t0, e32, m4, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e32, m8, ta, ma
> + .endif
> +.endm
> +
> +.macro avg_nx1 w vlen
> + vsetvlstatic16 \w, \vlen, 0
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vadd.vv v8, v8, v0
> + vmax.vx v8, v8, zero
> + vsetvlstatic8 \w, \vlen, 0
> + vnclipu.wi v8, v8, 7
> + vse8.v v8, (a0)
> +.endm
> +
> +.macro avg w h vlen
> + csrw vxrm, zero
> +
> +.if \w <= (\vlen / 4) && \h >= 4
> +.rept (\h / 4)
> + vsetvlstatic16 \w, \vlen, 0
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + addi t3, a2, 128*2*2
> + addi t4, a3, 128*2*2
> + addi a7, a3, 128*2*3
> + addi t6, a2, 128*2*3
> + add t2, a0, a1
> + sh1add t5, a1, a0
> + add a6, t5, a1
> + vle16.v v0, (a2)
> + vle16.v v4, (a3)
> + vle16.v v8, (t0)
> + vle16.v v12, (t1)
> + vle16.v v16, (t3)
> + vle16.v v20, (t4)
> + vle16.v v24, (t6)
> + vle16.v v28, (a7)
I would expect that you can get better performance by interleaving scalar and
vector stuff, and possibly also vector loads and vector arithmetic.
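A minimal sketch of such an interleaving, keeping the register roles already
used above (a2/a3 point at the two int16 source rows, t0/t1 at the next pair of
rows) and assuming the usual vsetvlstatic16 has already run:
        vle16.v     v0, (a2)
        addi        t0, a2, 128*2          // scalar address math slotted between vector ops
        vle16.v     v4, (a3)
        addi        t1, a3, 128*2
        vadd.vv     v4, v4, v0             // row 0 arithmetic while the row 1 loads issue
        vle16.v     v8, (t0)
        add         t2, a0, a1
        vle16.v     v12, (t1)
        vadd.vv     v12, v12, v8
Whether this wins anything depends on how the core dual-issues scalar and
vector instructions; it is only an ordering sketch, not a measured variant.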
> + vadd.vv v4, v4, v0
> + vadd.vv v12, v12, v8
> + vadd.vv v20, v20, v16
> + vadd.vv v28, v28, v24
> + vmax.vx v4, v4, zero
> + vmax.vx v12, v12, zero
> + vmax.vx v20, v20, zero
> + vmax.vx v28, v28, zero
> + vsetvlstatic8 \w, \vlen, 0
> + vnclipu.wi v4, v4, 7
> + vnclipu.wi v12, v12, 7
> + vnclipu.wi v20, v20, 7
> + vnclipu.wi v28, v28, 7
> + vse8.v v4, (a0)
> + vse8.v v12, (t2)
> + vse8.v v20, (t5)
> + vse8.v v28, (a6)
> + addi a2, a2, 128*8
> + addi a3, a3, 128*8
> + sh2add a0, a1, a0
> +.endr
> +
> +.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
> +.rept (\h / 2)
> + vsetvlstatic16 \w, \vlen, 0
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + add t2, a0, a1
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vle16.v v16, (t0)
> + vle16.v v24, (t1)
> + vadd.vv v8, v8, v0
> + vadd.vv v24, v24, v16
> + vmax.vx v8, v8, zero
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \w, \vlen, 0
> + vnclipu.wi v8, v8, 7
> + vnclipu.wi v24, v24, 7
> + vse8.v v8, (a0)
> + vse8.v v24, (t2)
> + addi a2, a2, 128*4
> + addi a3, a3, 128*4
> + sh1add a0, a1, a0
> +.endr
> +
> +.else
> +.rept \h
> + avg_nx1 \w, \vlen
> + .if \w == 128 && \vlen == 128
> + addi a2, a2, 64*2
> + addi a3, a3, 64*2
> + addi a0, a0, 64
> + avg_nx1 \w, \vlen
> + addi a2, a2, -64*2
> + addi a3, a3, -64*2
> + addi a0, a0, -64
> + .endif
> + addi a2, a2, 128*2
> + addi a3, a3, 128*2
> + add a0, a0, a1
> +.endr
> +.endif
> +.endm
> +
> +.macro w_avg_nx1 w vlen
> + vsetvlstatic16 \w, \vlen, 1
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vwmul.vx v16, v0, a7
> + vwmacc.vx v16, t3, v8
> + vsetvlstatic32 \w, \vlen
> + vadd.vx v16, v16, t4
> + vsetvlstatic16 \w, \vlen, 1
> + vnsrl.wx v16, v16, t6
> + vmax.vx v16, v16, zero
> + vsetvlstatic8 \w, \vlen, 1
> + vnclipu.wi v16, v16, 0
> + vse8.v v16, (a0)
> +.endm
> +
> +#if (__riscv_xlen == 64)
> +.macro w_avg w h vlen
> + csrw vxrm, zero
> + addi t6, a6, 7
> + ld t3, (sp)
> + ld t4, 8(sp)
> + ld t5, 16(sp)
> + add t4, t4, t5
> + addi t4, t4, 1 // o0 + o1 + 1
> + addi t5, t6, -1 // shift - 1
> + sll t4, t4, t5
> +
> +.if \w <= (\vlen / 8)
> + .rept (\h / 2)
> + vsetvlstatic16 \w, \vlen, 1
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + add t2, a0, a1
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vle16.v v20, (t0)
> + vle16.v v24, (t1)
> + vwmul.vx v16, v0, a7
> + vwmul.vx v28, v20, a7
> + vwmacc.vx v16, t3, v8
> + vwmacc.vx v28, t3, v24
> + vsetvlstatic32 \w, \vlen
> + vadd.vx v16, v16, t4
> + vadd.vx v28, v28, t4
> + vsetvlstatic16 \w, \vlen, 1
> + vnsrl.wx v16, v16, t6
> + vnsrl.wx v28, v28, t6
> + vmax.vx v16, v16, zero
> + vmax.vx v28, v28, zero
> + vsetvlstatic8 \w, \vlen, 1
> + vnclipu.wi v16, v16, 0
> + vnclipu.wi v28, v28, 0
> + vse8.v v16, (a0)
> + vse8.v v28, (t2)
> + addi a2, a2, 128*4
> + addi a3, a3, 128*4
> + sh1add a0, a1, a0
> + .endr
> +.else
> + .rept \h
> + w_avg_nx1 \w, \vlen
> + .if \w == (\vlen / 2)
> + addi a2, a2, (\vlen / 2)
> + addi a3, a3, (\vlen / 2)
> + addi a0, a0, (\vlen / 4)
> + w_avg_nx1 \w, \vlen
> + addi a2, a2, -(\vlen / 2)
> + addi a3, a3, -(\vlen / 2)
> + addi a0, a0, -(\vlen / 4)
> + .elseif \w == 128 && \vlen == 128
> + .rept 3
> + addi a2, a2, (\vlen / 2)
> + addi a3, a3, (\vlen / 2)
> + addi a0, a0, (\vlen / 4)
> + w_avg_nx1 \w, \vlen
> + .endr
> + addi a2, a2, -(\vlen / 2) * 3
> + addi a3, a3, -(\vlen / 2) * 3
> + addi a0, a0, -(\vlen / 4) * 3
> + .endif
> +
> + addi a2, a2, 128*2
> + addi a3, a3, 128*2
> + add a0, a0, a1
> + .endr
> +.endif
> +.endm
> +#endif
> +
> +.macro func_avg name vlen
> +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
> +.irp w,2,4,8,16,32,64,128
> + li t3, \w
> + bne a4, t3, \name\vlen\()end\w
> +.irp h,2,4,8,16,32,64,128
> + li t4, \h
> + bne a5, t4, \name\vlen\()end\w\h
> + \name \w \h \vlen
> + ret
> +\name\vlen\()end\w\h:
> +.endr
> +\name\vlen\()end\w:
These labels lead to nowhere? If you actually mean to implicitly fall through
to the next function, you can use the function name directly rather than add
odd labels.
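A sketch of that shape, using hypothetical per-width entry points (the function
names below are invented for illustration and are not in the patch):
func ff_vvc_avg_w2_8_rvv_256, zve32x
        // if the width does not match, branch directly to the next width's
        // named entry instead of to a local end label
        li      t3, 2
        bne     a4, t3, ff_vvc_avg_w4_8_rvv_256
        // ... height dispatch and the width-2 body ...
        ret
endfunc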
> +.endr
> +endfunc
> +.endm
> +
> +func_avg avg 256
> +func_avg avg 128
> +#if (__riscv_xlen == 64)
> +func_avg w_avg 256
> +func_avg w_avg 128
> +#endif
> diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
> new file mode 100644
> index 0000000000..d26b4c1c4a
> --- /dev/null
> +++ b/libavcodec/riscv/vvcdsp_init.c
> @@ -0,0 +1,76 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> +#include "libavcodec/vvc/dsp.h"
> +
> +#define bf(fn, bd, opt) fn##_##bd##_##opt
> +
> +#define AVG_PROTOTYPES(bd, opt) \
> +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
> + const int16_t *src0, const int16_t *src1, int width, int height); \
> +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
> + const int16_t *src0, const int16_t *src1, int width, int height, \
> + int denom, int w0, int w1, int o0, int o1);
> +
> +AVG_PROTOTYPES(8, rvv_128)
> +AVG_PROTOTYPES(8, rvv_256)
> +
> +#define AVG_INIT(bd, opt) do { \
> + c->inter.avg = bf(ff_vvc_avg, bd, opt); \
> + c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \
> +} while (0)
> +
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> +{
> +#if HAVE_RVV
> + const int flags = av_get_cpu_flags();
> +
> + if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> + ff_rv_vlen_least(256)) {
> + switch (bd) {
> + case 8:
> + c->inter.avg = ff_vvc_avg_8_rvv_256;
> +# if (__riscv_xlen == 64)
> + c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
> +# endif
> + break;
> + default:
> + break;
> + }
> + } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> + ff_rv_vlen_least(128)) {
> + switch (bd) {
> + case 8:
> + c->inter.avg = ff_vvc_avg_8_rvv_128;
> +# if (__riscv_xlen == 64)
> + c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
> +# endif
> + break;
> + default:
> + break;
> + }
> + }
> +#endif
> +}
> diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> index 41e830a98a..c55a37d255 100644
> --- a/libavcodec/vvc/dsp.c
> +++ b/libavcodec/vvc/dsp.c
> @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
> break;
> }
>
> -#if ARCH_X86
> +#if ARCH_RISCV
> + ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> +#elif ARCH_X86
> ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
> #endif
> }
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 9810ac314c..dcb978549f 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -167,6 +167,7 @@ typedef struct VVCDSPContext {
>
> void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
>
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
> void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
>
> #endif /* AVCODEC_VVC_DSP_H */
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-05-21 16:03 ` Rémi Denis-Courmont
@ 2024-05-21 17:24 ` flow gg
2024-05-21 19:24 ` uk7b
0 siblings, 1 reply; 19+ messages in thread
From: flow gg @ 2024-05-21 17:24 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> I would expect that you can get better performance by interleaving scalar and
> vector stuff, and possibly also vector loads and vector arithmetic.
Okay, I will try.
> These labels lead to nowhere? If you actually mean to implicitly fall through
> to the next function, you can use the function name directly rather than add
> odd labels.
These labels are used to turn the variable width and height parameters into
assembly-time constants for better performance, and to set up the next .irp
iteration. Some of the names look odd only because each one has to be unique.
There is only one function here, so execution simply passes through these
labels on to the next width/height check.
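Concretely, the outer .irp expands (shown for avg with vlen=256, abridged) into
a chain of constant width/height tests, and each "end" label is simply where
the next test begins:
        li      t3, 2
        bne     a4, t3, avg256end2      // width != 2: skip ahead to the width-4 tests
        li      t4, 2
        bne     a5, t4, avg256end22     // height != 2: try height 4
        avg     2, 2, 256               // w and h are assembly-time constants here
        ret
avg256end22:
        li      t4, 4
        bne     a5, t4, avg256end24
        avg     2, 4, 256
        ret
avg256end24:
        // ... remaining heights for width 2 ...
avg256end2:
        li      t3, 4
        bne     a4, t3, avg256end4      // and so on for the remaining widths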
Rémi Denis-Courmont <remi@remlab.net> wrote on Wed, May 22, 2024 at 00:04:
> On Tuesday, 21 May 2024 at 10:37:51 EEST, uk7b@foxmail.com wrote:
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> > ---
> > libavcodec/riscv/Makefile | 2 +
> > libavcodec/riscv/vvc_mc_rvv.S | 312 +++++++++++++++++++++++++++++++++
> > libavcodec/riscv/vvcdsp_init.c | 76 ++++++++
> > libavcodec/vvc/dsp.c | 4 +-
> > libavcodec/vvc/dsp.h | 1 +
> > 5 files changed, 394 insertions(+), 1 deletion(-)
> > create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
> > create mode 100644 libavcodec/riscv/vvcdsp_init.c
> >
> > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> > index 27b268ae39..6297664fc9 100644
> > --- a/libavcodec/riscv/Makefile
> > +++ b/libavcodec/riscv/Makefile
> > @@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
> > RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> > OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> > RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> > +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
> > +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
> > diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
> > new file mode 100644
> > index 0000000000..26a6afba1f
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvc_mc_rvv.S
> > @@ -0,0 +1,312 @@
> > +/*
> > + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +.macro vsetvlstatic8 w vlen is_w
> > + .if \w <= 2
> > + vsetivli zero, \w, e8, mf8, ta, ma
> > + .elseif \w <= 4 && \vlen == 128
> > + vsetivli zero, \w, e8, mf4, ta, ma
> > + .elseif \w <= 4 && \vlen >= 256
> > + vsetivli zero, \w, e8, mf8, ta, ma
> > + .elseif \w <= 8 && \vlen == 128
> > + vsetivli zero, \w, e8, mf2, ta, ma
> > + .elseif \w <= 8 && \vlen >= 256
> > + vsetivli zero, \w, e8, mf4, ta, ma
> > + .elseif \w <= 16 && \vlen == 128
> > + vsetivli zero, \w, e8, m1, ta, ma
> > + .elseif \w <= 16 && \vlen >= 256
> > + vsetivli zero, \w, e8, mf2, ta, ma
> > + .elseif \w <= 32 && \vlen >= 256
> > + li t0, \w
> > + vsetvli zero, t0, e8, m1, ta, ma
> > + .elseif \w <= (\vlen / 4) || \is_w
> > + li t0, 64
> > + vsetvli zero, t0, e8, m2, ta, ma
> > + .else
> > + li t0, \w
> > + vsetvli zero, t0, e8, m4, ta, ma
> > + .endif
> > +.endm
> > +
> > +.macro vsetvlstatic16 w vlen is_w
> > + .if \w <= 2
> > + vsetivli zero, \w, e16, mf4, ta, ma
> > + .elseif \w <= 4 && \vlen == 128
> > + vsetivli zero, \w, e16, mf2, ta, ma
> > + .elseif \w <= 4 && \vlen >= 256
> > + vsetivli zero, \w, e16, mf4, ta, ma
> > + .elseif \w <= 8 && \vlen == 128
> > + vsetivli zero, \w, e16, m1, ta, ma
> > + .elseif \w <= 8 && \vlen >= 256
> > + vsetivli zero, \w, e16, mf2, ta, ma
> > + .elseif \w <= 16 && \vlen == 128
> > + vsetivli zero, \w, e16, m2, ta, ma
> > + .elseif \w <= 16 && \vlen >= 256
> > + vsetivli zero, \w, e16, m1, ta, ma
> > + .elseif \w <= 32 && \vlen >= 256
> > + li t0, \w
> > + vsetvli zero, t0, e16, m2, ta, ma
> > + .elseif \w <= (\vlen / 4) || \is_w
> > + li t0, 64
> > + vsetvli zero, t0, e16, m4, ta, ma
> > + .else
> > + li t0, \w
> > + vsetvli zero, t0, e16, m8, ta, ma
> > + .endif
> > +.endm
> > +
> > +.macro vsetvlstatic32 w vlen
> > + .if \w <= 2
> > + vsetivli zero, \w, e32, mf2, ta, ma
> > + .elseif \w <= 4 && \vlen == 128
> > + vsetivli zero, \w, e32, m1, ta, ma
> > + .elseif \w <= 4 && \vlen >= 256
> > + vsetivli zero, \w, e32, mf2, ta, ma
> > + .elseif \w <= 8 && \vlen == 128
> > + vsetivli zero, \w, e32, m2, ta, ma
> > + .elseif \w <= 8 && \vlen >= 256
> > + vsetivli zero, \w, e32, m1, ta, ma
> > + .elseif \w <= 16 && \vlen == 128
> > + vsetivli zero, \w, e32, m4, ta, ma
> > + .elseif \w <= 16 && \vlen >= 256
> > + vsetivli zero, \w, e32, m2, ta, ma
> > + .elseif \w <= 32 && \vlen >= 256
> > + li t0, \w
> > + vsetvli zero, t0, e32, m4, ta, ma
> > + .else
> > + li t0, \w
> > + vsetvli zero, t0, e32, m8, ta, ma
> > + .endif
> > +.endm
> > +
> > +.macro avg_nx1 w vlen
> > + vsetvlstatic16 \w, \vlen, 0
> > + vle16.v v0, (a2)
> > + vle16.v v8, (a3)
> > + vadd.vv v8, v8, v0
> > + vmax.vx v8, v8, zero
> > + vsetvlstatic8 \w, \vlen, 0
> > + vnclipu.wi v8, v8, 7
> > + vse8.v v8, (a0)
> > +.endm
> > +
> > +.macro avg w h vlen
> > + csrw vxrm, zero
> > +
> > +.if \w <= (\vlen / 4) && \h >= 4
> > +.rept (\h / 4)
> > + vsetvlstatic16 \w, \vlen, 0
> > + addi t0, a2, 128*2
> > + addi t1, a3, 128*2
> > + addi t3, a2, 128*2*2
> > + addi t4, a3, 128*2*2
> > + addi a7, a3, 128*2*3
> > + addi t6, a2, 128*2*3
> > + add t2, a0, a1
> > + sh1add t5, a1, a0
> > + add a6, t5, a1
> > + vle16.v v0, (a2)
> > + vle16.v v4, (a3)
> > + vle16.v v8, (t0)
> > + vle16.v v12, (t1)
> > + vle16.v v16, (t3)
> > + vle16.v v20, (t4)
> > + vle16.v v24, (t6)
> > + vle16.v v28, (a7)
>
> I would expect that you can get better performance by interleaving scalar and
> vector stuff, and possibly also vector loads and vector arithmetic.
>
> > + vadd.vv v4, v4, v0
> > + vadd.vv v12, v12, v8
> > + vadd.vv v20, v20, v16
> > + vadd.vv v28, v28, v24
> > + vmax.vx v4, v4, zero
> > + vmax.vx v12, v12, zero
> > + vmax.vx v20, v20, zero
> > + vmax.vx v28, v28, zero
> > + vsetvlstatic8 \w, \vlen, 0
> > + vnclipu.wi v4, v4, 7
> > + vnclipu.wi v12, v12, 7
> > + vnclipu.wi v20, v20, 7
> > + vnclipu.wi v28, v28, 7
> > + vse8.v v4, (a0)
> > + vse8.v v12, (t2)
> > + vse8.v v20, (t5)
> > + vse8.v v28, (a6)
> > + addi a2, a2, 128*8
> > + addi a3, a3, 128*8
> > + sh2add a0, a1, a0
> > +.endr
> > +
> > +.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
> > +.rept (\h / 2)
> > + vsetvlstatic16 \w, \vlen, 0
> > + addi t0, a2, 128*2
> > + addi t1, a3, 128*2
> > + add t2, a0, a1
> > + vle16.v v0, (a2)
> > + vle16.v v8, (a3)
> > + vle16.v v16, (t0)
> > + vle16.v v24, (t1)
> > + vadd.vv v8, v8, v0
> > + vadd.vv v24, v24, v16
> > + vmax.vx v8, v8, zero
> > + vmax.vx v24, v24, zero
> > + vsetvlstatic8 \w, \vlen, 0
> > + vnclipu.wi v8, v8, 7
> > + vnclipu.wi v24, v24, 7
> > + vse8.v v8, (a0)
> > + vse8.v v24, (t2)
> > + addi a2, a2, 128*4
> > + addi a3, a3, 128*4
> > + sh1add a0, a1, a0
> > +.endr
> > +
> > +.else
> > +.rept \h
> > + avg_nx1 \w, \vlen
> > + .if \w == 128 && \vlen == 128
> > + addi a2, a2, 64*2
> > + addi a3, a3, 64*2
> > + addi a0, a0, 64
> > + avg_nx1 \w, \vlen
> > + addi a2, a2, -64*2
> > + addi a3, a3, -64*2
> > + addi a0, a0, -64
> > + .endif
> > + addi a2, a2, 128*2
> > + addi a3, a3, 128*2
> > + add a0, a0, a1
> > +.endr
> > +.endif
> > +.endm
> > +
> > +.macro w_avg_nx1 w vlen
> > + vsetvlstatic16 \w, \vlen, 1
> > + vle16.v v0, (a2)
> > + vle16.v v8, (a3)
> > + vwmul.vx v16, v0, a7
> > + vwmacc.vx v16, t3, v8
> > + vsetvlstatic32 \w, \vlen
> > + vadd.vx v16, v16, t4
> > + vsetvlstatic16 \w, \vlen, 1
> > + vnsrl.wx v16, v16, t6
> > + vmax.vx v16, v16, zero
> > + vsetvlstatic8 \w, \vlen, 1
> > + vnclipu.wi v16, v16, 0
> > + vse8.v v16, (a0)
> > +.endm
> > +
> > +#if (__riscv_xlen == 64)
> > +.macro w_avg w h vlen
> > + csrw vxrm, zero
> > + addi t6, a6, 7
> > + ld t3, (sp)
> > + ld t4, 8(sp)
> > + ld t5, 16(sp)
> > + add t4, t4, t5
> > + addi t4, t4, 1 // o0 + o1 + 1
> > + addi t5, t6, -1 // shift - 1
> > + sll t4, t4, t5
> > +
> > +.if \w <= (\vlen / 8)
> > + .rept (\h / 2)
> > + vsetvlstatic16 \w, \vlen, 1
> > + addi t0, a2, 128*2
> > + addi t1, a3, 128*2
> > + add t2, a0, a1
> > + vle16.v v0, (a2)
> > + vle16.v v8, (a3)
> > + vle16.v v20, (t0)
> > + vle16.v v24, (t1)
> > + vwmul.vx v16, v0, a7
> > + vwmul.vx v28, v20, a7
> > + vwmacc.vx v16, t3, v8
> > + vwmacc.vx v28, t3, v24
> > + vsetvlstatic32 \w, \vlen
> > + vadd.vx v16, v16, t4
> > + vadd.vx v28, v28, t4
> > + vsetvlstatic16 \w, \vlen, 1
> > + vnsrl.wx v16, v16, t6
> > + vnsrl.wx v28, v28, t6
> > + vmax.vx v16, v16, zero
> > + vmax.vx v28, v28, zero
> > + vsetvlstatic8 \w, \vlen, 1
> > + vnclipu.wi v16, v16, 0
> > + vnclipu.wi v28, v28, 0
> > + vse8.v v16, (a0)
> > + vse8.v v28, (t2)
> > + addi a2, a2, 128*4
> > + addi a3, a3, 128*4
> > + sh1add a0, a1, a0
> > + .endr
> > +.else
> > + .rept \h
> > + w_avg_nx1 \w, \vlen
> > + .if \w == (\vlen / 2)
> > + addi a2, a2, (\vlen / 2)
> > + addi a3, a3, (\vlen / 2)
> > + addi a0, a0, (\vlen / 4)
> > + w_avg_nx1 \w, \vlen
> > + addi a2, a2, -(\vlen / 2)
> > + addi a3, a3, -(\vlen / 2)
> > + addi a0, a0, -(\vlen / 4)
> > + .elseif \w == 128 && \vlen == 128
> > + .rept 3
> > + addi a2, a2, (\vlen / 2)
> > + addi a3, a3, (\vlen / 2)
> > + addi a0, a0, (\vlen / 4)
> > + w_avg_nx1 \w, \vlen
> > + .endr
> > + addi a2, a2, -(\vlen / 2) * 3
> > + addi a3, a3, -(\vlen / 2) * 3
> > + addi a0, a0, -(\vlen / 4) * 3
> > + .endif
> > +
> > + addi a2, a2, 128*2
> > + addi a3, a3, 128*2
> > + add a0, a0, a1
> > + .endr
> > +.endif
> > +.endm
> > +#endif
> > +
> > +.macro func_avg name vlen
> > +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
> > +.irp w,2,4,8,16,32,64,128
> > + li t3, \w
> > + bne a4, t3, \name\vlen\()end\w
> > +.irp h,2,4,8,16,32,64,128
> > + li t4, \h
> > + bne a5, t4, \name\vlen\()end\w\h
> > + \name \w \h \vlen
> > + ret
> > +\name\vlen\()end\w\h:
> > +.endr
> > +\name\vlen\()end\w:
>
> These labels lead to nowhere? If you actually mean to implicitly fall through
> to the next function, you can use the function name directly rather than add
> odd labels.
>
> > +.endr
> > +endfunc
> > +.endm
> > +
> > +func_avg avg 256
> > +func_avg avg 128
> > +#if (__riscv_xlen == 64)
> > +func_avg w_avg 256
> > +func_avg w_avg 128
> > +#endif
> > diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
> > new file mode 100644
> > index 0000000000..d26b4c1c4a
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvcdsp_init.c
> > @@ -0,0 +1,76 @@
> > +/*
> > + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "config.h"
> > +
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavutil/riscv/cpu.h"
> > +#include "libavcodec/vvc/dsp.h"
> > +
> > +#define bf(fn, bd, opt) fn##_##bd##_##opt
> > +
> > +#define AVG_PROTOTYPES(bd, opt) \
> > +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
> > + const int16_t *src0, const int16_t *src1, int width, int height); \
> > +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
> > + const int16_t *src0, const int16_t *src1, int width, int height, \
> > + int denom, int w0, int w1, int o0, int o1);
> > +
> > +AVG_PROTOTYPES(8, rvv_128)
> > +AVG_PROTOTYPES(8, rvv_256)
> > +
> > +#define AVG_INIT(bd, opt) do { \
> > + c->inter.avg = bf(ff_vvc_avg, bd, opt); \
> > + c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \
> > +} while (0)
> > +
> > +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> > +{
> > +#if HAVE_RVV
> > + const int flags = av_get_cpu_flags();
> > +
> > + if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> > + ff_rv_vlen_least(256)) {
> > + switch (bd) {
> > + case 8:
> > + c->inter.avg = ff_vvc_avg_8_rvv_256;
> > +# if (__riscv_xlen == 64)
> > + c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
> > +# endif
> > + break;
> > + default:
> > + break;
> > + }
> > + } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> > + ff_rv_vlen_least(128)) {
> > + switch (bd) {
> > + case 8:
> > + c->inter.avg = ff_vvc_avg_8_rvv_128;
> > +# if (__riscv_xlen == 64)
> > + c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
> > +# endif
> > + break;
> > + default:
> > + break;
> > + }
> > + }
> > +#endif
> > +}
> > diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> > index 41e830a98a..c55a37d255 100644
> > --- a/libavcodec/vvc/dsp.c
> > +++ b/libavcodec/vvc/dsp.c
> > @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
> > break;
> > }
> >
> > -#if ARCH_X86
> > +#if ARCH_RISCV
> > + ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> > +#elif ARCH_X86
> > ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
> > #endif
> > }
> > diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> > index 9810ac314c..dcb978549f 100644
> > --- a/libavcodec/vvc/dsp.h
> > +++ b/libavcodec/vvc/dsp.h
> > @@ -167,6 +167,7 @@ typedef struct VVCDSPContext {
> >
> > void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
> >
> > +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
> > void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
> >
> > #endif /* AVCODEC_VVC_DSP_H */
>
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-05-21 17:24 ` flow gg
@ 2024-05-21 19:24 ` uk7b
2024-05-21 19:26 ` flow gg
2024-05-25 8:27 ` Rémi Denis-Courmont
0 siblings, 2 replies; 19+ messages in thread
From: uk7b @ 2024-05-21 19:24 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
avg_8_2x2_c : 1.0 1.0
avg_8_2x2_rvv_i32 : 0.7 0.7
avg_8_2x4_c : 2.0 2.0
avg_8_2x4_rvv_i32 : 1.0 0.7
avg_8_2x8_c : 4.0 3.7
avg_8_2x8_rvv_i32 : 1.5 1.2
avg_8_2x16_c : 7.5 7.7
avg_8_2x16_rvv_i32 : 2.7 2.5
avg_8_2x32_c : 14.2 15.0
avg_8_2x32_rvv_i32 : 5.0 4.5
avg_8_2x64_c : 28.5 30.2
avg_8_2x64_rvv_i32 : 9.5 8.7
avg_8_2x128_c : 80.0 70.5
avg_8_2x128_rvv_i32 : 50.7 41.2
avg_8_4x2_c : 1.7 2.0
avg_8_4x2_rvv_i32 : 0.7 0.7
avg_8_4x4_c : 3.5 3.7
avg_8_4x4_rvv_i32 : 1.2 1.0
avg_8_4x8_c : 6.7 7.0
avg_8_4x8_rvv_i32 : 1.5 1.2
avg_8_4x16_c : 13.2 14.0
avg_8_4x16_rvv_i32 : 2.7 2.5
avg_8_4x32_c : 26.2 27.7
avg_8_4x32_rvv_i32 : 5.0 4.5
avg_8_4x64_c : 52.2 55.0
avg_8_4x64_rvv_i32 : 9.5 8.7
avg_8_4x128_c : 146.0 117.5
avg_8_4x128_rvv_i32 : 53.2 40.5
avg_8_8x2_c : 3.5 3.5
avg_8_8x2_rvv_i32 : 0.7 0.7
avg_8_8x4_c : 6.5 6.5
avg_8_8x4_rvv_i32 : 1.2 1.0
avg_8_8x8_c : 12.7 13.2
avg_8_8x8_rvv_i32 : 2.0 1.5
avg_8_8x16_c : 25.2 26.2
avg_8_8x16_rvv_i32 : 3.5 2.5
avg_8_8x32_c : 50.0 52.7
avg_8_8x32_rvv_i32 : 6.5 4.7
avg_8_8x64_c : 99.7 105.0
avg_8_8x64_rvv_i32 : 12.5 8.5
avg_8_8x128_c : 225.7 218.0
avg_8_8x128_rvv_i32 : 78.0 39.2
avg_8_16x2_c : 6.2 6.7
avg_8_16x2_rvv_i32 : 1.2 0.7
avg_8_16x4_c : 12.2 12.7
avg_8_16x4_rvv_i32 : 2.0 1.2
avg_8_16x8_c : 24.7 26.0
avg_8_16x8_rvv_i32 : 3.5 1.7
avg_8_16x16_c : 49.0 51.5
avg_8_16x16_rvv_i32 : 6.2 3.2
avg_8_16x32_c : 97.5 102.5
avg_8_16x32_rvv_i32 : 11.5 5.7
avg_8_16x64_c : 212.5 204.7
avg_8_16x64_rvv_i32 : 22.5 11.0
avg_8_16x128_c : 411.2 418.2
avg_8_16x128_rvv_i32 : 76.0 47.7
avg_8_32x2_c : 12.2 12.7
avg_8_32x2_rvv_i32 : 2.0 1.2
avg_8_32x4_c : 24.2 25.5
avg_8_32x4_rvv_i32 : 3.2 1.7
avg_8_32x8_c : 48.5 50.7
avg_8_32x8_rvv_i32 : 5.7 3.2
avg_8_32x16_c : 96.5 101.2
avg_8_32x16_rvv_i32 : 10.7 5.7
avg_8_32x32_c : 192.5 202.5
avg_8_32x32_rvv_i32 : 20.7 10.5
avg_8_32x64_c : 411.2 404.5
avg_8_32x64_rvv_i32 : 41.0 20.5
avg_8_32x128_c : 834.7 855.2
avg_8_32x128_rvv_i32 : 151.2 118.7
avg_8_64x2_c : 24.0 25.2
avg_8_64x2_rvv_i32 : 3.2 1.7
avg_8_64x4_c : 48.2 50.5
avg_8_64x4_rvv_i32 : 5.2 3.0
avg_8_64x8_c : 95.7 100.7
avg_8_64x8_rvv_i32 : 10.0 5.2
avg_8_64x16_c : 191.7 201.2
avg_8_64x16_rvv_i32 : 19.2 9.5
avg_8_64x32_c : 406.2 402.0
avg_8_64x32_rvv_i32 : 38.0 18.5
avg_8_64x64_c : 827.5 833.7
avg_8_64x64_rvv_i32 : 148.2 95.2
avg_8_64x128_c : 1607.7 1625.7
avg_8_64x128_rvv_i32 : 252.0 179.5
avg_8_128x2_c : 48.7 51.0
avg_8_128x2_rvv_i32 : 5.5 2.7
avg_8_128x4_c : 96.7 101.2
avg_8_128x4_rvv_i32 : 9.7 5.0
avg_8_128x8_c : 192.5 202.0
avg_8_128x8_rvv_i32 : 19.0 9.0
avg_8_128x16_c : 403.5 403.2
avg_8_128x16_rvv_i32 : 37.0 17.5
avg_8_128x32_c : 787.0 805.7
avg_8_128x32_rvv_i32 : 73.5 34.2
avg_8_128x64_c : 1635.7 1654.7
avg_8_128x64_rvv_i32 : 229.5 68.5
avg_8_128x128_c : 3217.0 3233.5
avg_8_128x128_rvv_i32 : 435.0 321.2
w_avg_8_2x2_c : 1.5 1.5
w_avg_8_2x2_rvv_i32 : 1.2 1.2
w_avg_8_2x4_c : 2.7 2.5
w_avg_8_2x4_rvv_i32 : 1.7 1.7
w_avg_8_2x8_c : 5.0 4.7
w_avg_8_2x8_rvv_i32 : 2.7 2.5
w_avg_8_2x16_c : 9.7 9.5
w_avg_8_2x16_rvv_i32 : 4.7 4.5
w_avg_8_2x32_c : 19.0 18.5
w_avg_8_2x32_rvv_i32 : 9.0 8.0
w_avg_8_2x64_c : 37.2 37.0
w_avg_8_2x64_rvv_i32 : 17.5 15.5
w_avg_8_2x128_c : 120.7 82.7
w_avg_8_2x128_rvv_i32 : 71.2 49.0
w_avg_8_4x2_c : 2.5 2.5
w_avg_8_4x2_rvv_i32 : 1.2 1.2
w_avg_8_4x4_c : 4.7 4.5
w_avg_8_4x4_rvv_i32 : 1.7 1.5
w_avg_8_4x8_c : 9.0 9.0
w_avg_8_4x8_rvv_i32 : 2.7 2.5
w_avg_8_4x16_c : 17.7 17.7
w_avg_8_4x16_rvv_i32 : 5.0 4.2
w_avg_8_4x32_c : 34.7 34.7
w_avg_8_4x32_rvv_i32 : 9.0 8.0
w_avg_8_4x64_c : 69.7 69.5
w_avg_8_4x64_rvv_i32 : 17.2 15.5
w_avg_8_4x128_c : 171.7 154.7
w_avg_8_4x128_rvv_i32 : 87.0 48.0
w_avg_8_8x2_c : 4.5 4.5
w_avg_8_8x2_rvv_i32 : 1.5 1.2
w_avg_8_8x4_c : 8.7 8.7
w_avg_8_8x4_rvv_i32 : 2.0 1.7
w_avg_8_8x8_c : 17.2 17.0
w_avg_8_8x8_rvv_i32 : 3.5 2.5
w_avg_8_8x16_c : 34.0 34.0
w_avg_8_8x16_rvv_i32 : 6.0 4.5
w_avg_8_8x32_c : 67.5 68.0
w_avg_8_8x32_rvv_i32 : 10.7 8.2
w_avg_8_8x64_c : 135.7 135.0
w_avg_8_8x64_rvv_i32 : 21.0 15.7
w_avg_8_8x128_c : 304.0 280.0
w_avg_8_8x128_rvv_i32 : 65.5 56.7
w_avg_8_16x2_c : 8.5 8.7
w_avg_8_16x2_rvv_i32 : 2.0 1.2
w_avg_8_16x4_c : 16.7 17.0
w_avg_8_16x4_rvv_i32 : 3.2 2.0
w_avg_8_16x8_c : 33.5 33.5
w_avg_8_16x8_rvv_i32 : 5.7 3.0
w_avg_8_16x16_c : 66.7 62.2
w_avg_8_16x16_rvv_i32 : 27.0 5.2
w_avg_8_16x32_c : 132.5 133.0
w_avg_8_16x32_rvv_i32 : 20.2 9.7
w_avg_8_16x64_c : 264.2 239.0
w_avg_8_16x64_rvv_i32 : 39.7 18.7
w_avg_8_16x128_c : 572.5 541.2
w_avg_8_16x128_rvv_i32 : 148.5 55.2
w_avg_8_32x2_c : 16.7 16.7
w_avg_8_32x2_rvv_i32 : 3.2 2.0
w_avg_8_32x4_c : 33.2 33.2
w_avg_8_32x4_rvv_i32 : 6.0 3.0
w_avg_8_32x8_c : 66.0 66.0
w_avg_8_32x8_rvv_i32 : 11.0 5.5
w_avg_8_32x16_c : 131.2 122.7
w_avg_8_32x16_rvv_i32 : 21.5 9.7
w_avg_8_32x32_c : 262.2 268.7
w_avg_8_32x32_rvv_i32 : 42.2 18.5
w_avg_8_32x64_c : 544.2 547.0
w_avg_8_32x64_rvv_i32 : 83.5 37.0
w_avg_8_32x128_c : 1426.7 1139.7
w_avg_8_32x128_rvv_i32 : 201.0 138.2
w_avg_8_64x2_c : 33.0 33.0
w_avg_8_64x2_rvv_i32 : 6.0 3.0
w_avg_8_64x4_c : 65.7 65.7
w_avg_8_64x4_rvv_i32 : 11.2 5.5
w_avg_8_64x8_c : 131.0 131.5
w_avg_8_64x8_rvv_i32 : 21.5 10.0
w_avg_8_64x16_c : 289.2 262.7
w_avg_8_64x16_rvv_i32 : 42.5 19.2
w_avg_8_64x32_c : 548.7 525.2
w_avg_8_64x32_rvv_i32 : 83.7 37.5
w_avg_8_64x64_c : 1139.5 1208.2
w_avg_8_64x64_rvv_i32 : 209.0 107.5
w_avg_8_64x128_c : 2495.5 2300.5
w_avg_8_64x128_rvv_i32 : 420.2 208.7
w_avg_8_128x2_c : 66.0 66.5
w_avg_8_128x2_rvv_i32 : 11.2 5.5
w_avg_8_128x4_c : 131.2 132.5
w_avg_8_128x4_rvv_i32 : 21.5 10.0
w_avg_8_128x8_c : 280.2 275.7
w_avg_8_128x8_rvv_i32 : 42.2 19.5
w_avg_8_128x16_c : 549.0 527.7
w_avg_8_128x16_rvv_i32 : 104.7 37.7
w_avg_8_128x32_c : 1215.2 1068.5
w_avg_8_128x32_rvv_i32 : 189.0 74.7
w_avg_8_128x64_c : 2305.5 2145.5
w_avg_8_128x64_rvv_i32 : 386.7 190.0
w_avg_8_128x128_c : 5797.0 4600.2
w_avg_8_128x128_rvv_i32 : 760.5 343.0
---
libavcodec/riscv/Makefile | 2 +
libavcodec/riscv/vvc_mc_rvv.S | 313 +++++++++++++++++++++++++++++++++
libavcodec/riscv/vvcdsp_init.c | 71 ++++++++
libavcodec/vvc/dsp.c | 4 +-
libavcodec/vvc/dsp.h | 1 +
5 files changed, 390 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
create mode 100644 libavcodec/riscv/vvcdsp_init.c
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 27b268ae39..6297664fc9 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
+OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
new file mode 100644
index 0000000000..3c2bac37f7
--- /dev/null
+++ b/libavcodec/riscv/vvc_mc_rvv.S
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vsetvlstatic8 w vlen is_w
+ .if \w <= 2
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w <= 4 && \vlen == 128
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w <= 4 && \vlen >= 256
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w <= 8 && \vlen == 128
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w <= 8 && \vlen >= 256
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w <= 16 && \vlen == 128
+ vsetivli zero, \w, e8, m1, ta, ma
+ .elseif \w <= 16 && \vlen >= 256
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w <= 32 && \vlen >= 256
+ li t0, \w
+ vsetvli zero, t0, e8, m1, ta, ma
+ .elseif \w <= (\vlen / 4) || \is_w
+ li t0, 64
+ vsetvli zero, t0, e8, m2, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e8, m4, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic16 w vlen is_w
+ .if \w <= 2
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w <= 4 && \vlen == 128
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w <= 4 && \vlen >= 256
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w <= 8 && \vlen == 128
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w <= 8 && \vlen >= 256
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w <= 16 && \vlen == 128
+ vsetivli zero, \w, e16, m2, ta, ma
+ .elseif \w <= 16 && \vlen >= 256
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w <= 32 && \vlen >= 256
+ li t0, \w
+ vsetvli zero, t0, e16, m2, ta, ma
+ .elseif \w <= (\vlen / 4) || \is_w
+ li t0, 64
+ vsetvli zero, t0, e16, m4, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e16, m8, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic32 w vlen
+ .if \w <= 2
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w <= 4 && \vlen == 128
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w <= 4 && \vlen >= 256
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w <= 8 && \vlen == 128
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w <= 8 && \vlen >= 256
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w <= 16 && \vlen == 128
+ vsetivli zero, \w, e32, m4, ta, ma
+ .elseif \w <= 16 && \vlen >= 256
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w <= 32 && \vlen >= 256
+ li t0, \w
+ vsetvli zero, t0, e32, m4, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e32, m8, ta, ma
+ .endif
+.endm
+
+.macro avg_nx1 w vlen
+ vsetvlstatic16 \w, \vlen, 0
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vadd.vv v8, v8, v0
+ vmax.vx v8, v8, zero
+ vsetvlstatic8 \w, \vlen, 0
+ vnclipu.wi v8, v8, 7
+ vse8.v v8, (a0)
+.endm
+
+.macro avg w h vlen
+ csrw vxrm, zero
+
+.if \w <= (\vlen / 4) && \h >= 4
+.rept (\h / 4)
+ addi t1, a3, 128*2
+ addi t3, a2, 128*2*2
+ add t2, a0, a1
+ sh1add t5, a1, a0
+ addi t4, a3, 128*2*2
+ addi a7, a3, 128*2*3
+ addi t6, a2, 128*2*3
+ vsetvlstatic16 \w, \vlen, 0
+ addi t0, a2, 128*2
+ add a6, t5, a1
+ vle16.v v0, (a2)
+ vle16.v v4, (a3)
+ vle16.v v8, (t0)
+ vle16.v v12, (t1)
+ vle16.v v16, (t3)
+ vle16.v v20, (t4)
+ vle16.v v24, (t6)
+ vle16.v v28, (a7)
+ vadd.vv v4, v4, v0
+ vadd.vv v12, v12, v8
+ vadd.vv v20, v20, v16
+ vadd.vv v28, v28, v24
+ vmax.vx v4, v4, zero
+ vmax.vx v12, v12, zero
+ vmax.vx v20, v20, zero
+ vmax.vx v28, v28, zero
+ vsetvlstatic8 \w, \vlen, 0
+ addi a2, a2, 128*8
+ vnclipu.wi v4, v4, 7
+ vnclipu.wi v12, v12, 7
+ vnclipu.wi v20, v20, 7
+ vnclipu.wi v28, v28, 7
+ addi a3, a3, 128*8
+ vse8.v v4, (a0)
+ vse8.v v12, (t2)
+ vse8.v v20, (t5)
+ sh2add a0, a1, a0
+ vse8.v v28, (a6)
+.endr
+
+.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
+.rept (\h / 2)
+ vsetvlstatic16 \w, \vlen, 0
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ add t2, a0, a1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vle16.v v16, (t0)
+ vle16.v v24, (t1)
+ vadd.vv v8, v8, v0
+ vadd.vv v24, v24, v16
+ vmax.vx v8, v8, zero
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \w, \vlen, 0
+ addi a2, a2, 128*4
+ vnclipu.wi v8, v8, 7
+ vnclipu.wi v24, v24, 7
+ addi a3, a3, 128*4
+ vse8.v v8, (a0)
+ vse8.v v24, (t2)
+ sh1add a0, a1, a0
+.endr
+
+.else
+.rept \h
+ avg_nx1 \w, \vlen
+ .if \w == 128 && \vlen == 128
+ addi a2, a2, 64*2
+ addi a3, a3, 64*2
+ addi a0, a0, 64
+ avg_nx1 \w, \vlen
+ addi a0, a0, -64
+ addi a2, a2, 128
+ addi a3, a3, 128
+ .else
+ addi a2, a2, 128*2
+ addi a3, a3, 128*2
+ .endif
+ add a0, a0, a1
+.endr
+.endif
+.endm
+
+.macro w_avg_nx1 w vlen
+ vsetvlstatic16 \w, \vlen, 1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vwmul.vx v16, v0, a7
+ vwmacc.vx v16, t3, v8
+ vsetvlstatic32 \w, \vlen
+ vadd.vx v16, v16, t4
+ vsetvlstatic16 \w, \vlen, 1
+ vnsrl.wx v16, v16, t6
+ vmax.vx v16, v16, zero
+ vsetvlstatic8 \w, \vlen, 1
+ vnclipu.wi v16, v16, 0
+ vse8.v v16, (a0)
+.endm
+
+#if (__riscv_xlen == 64)
+.macro w_avg w h vlen
+ csrw vxrm, zero
+ addi t6, a6, 7
+ ld t3, (sp)
+ ld t4, 8(sp)
+ ld t5, 16(sp)
+ add t4, t4, t5
+ addi t4, t4, 1 // o0 + o1 + 1
+ addi t5, t6, -1 // shift - 1
+ sll t4, t4, t5
+
+.if \w <= (\vlen / 8)
+ .rept (\h / 2)
+ vsetvlstatic16 \w, \vlen, 1
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vle16.v v20, (t0)
+ vle16.v v24, (t1)
+ vwmul.vx v16, v0, a7
+ vwmul.vx v28, v20, a7
+ vwmacc.vx v16, t3, v8
+ vwmacc.vx v28, t3, v24
+ vsetvlstatic32 \w, \vlen
+ add t2, a0, a1
+ vadd.vx v16, v16, t4
+ vadd.vx v28, v28, t4
+ vsetvlstatic16 \w, \vlen, 1
+ vnsrl.wx v16, v16, t6
+ vnsrl.wx v28, v28, t6
+ vmax.vx v16, v16, zero
+ vmax.vx v28, v28, zero
+ vsetvlstatic8 \w, \vlen, 1
+ addi a2, a2, 128*4
+ vnclipu.wi v16, v16, 0
+ vnclipu.wi v28, v28, 0
+ vse8.v v16, (a0)
+ addi a3, a3, 128*4
+ vse8.v v28, (t2)
+ sh1add a0, a1, a0
+ .endr
+.else
+ .rept \h
+ w_avg_nx1 \w, \vlen
+ .if \w == (\vlen / 2)
+ addi a2, a2, (\vlen / 2)
+ addi a3, a3, (\vlen / 2)
+ addi a0, a0, (\vlen / 4)
+ w_avg_nx1 \w, \vlen
+ addi a2, a2, -(\vlen / 2)
+ addi a3, a3, -(\vlen / 2)
+ addi a0, a0, -(\vlen / 4)
+ .elseif \w == 128 && \vlen == 128
+ .rept 3
+ addi a2, a2, (\vlen / 2)
+ addi a3, a3, (\vlen / 2)
+ addi a0, a0, (\vlen / 4)
+ w_avg_nx1 \w, \vlen
+ .endr
+ addi a2, a2, -(\vlen / 2) * 3
+ addi a3, a3, -(\vlen / 2) * 3
+ addi a0, a0, -(\vlen / 4) * 3
+ .endif
+
+ addi a2, a2, 128*2
+ addi a3, a3, 128*2
+ add a0, a0, a1
+ .endr
+.endif
+.endm
+#endif
+
+.macro func_avg name vlen
+func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
+.irp w,2,4,8,16,32,64,128
+ li t3, \w
+ bne a4, t3, \name\vlen\()end\w
+.irp h,2,4,8,16,32,64,128
+ li t4, \h
+ bne a5, t4, \name\vlen\()end\w\h
+ \name \w \h \vlen
+ ret
+\name\vlen\()end\w\h:
+.endr
+\name\vlen\()end\w:
+.endr
+endfunc
+.endm
+
+func_avg avg 256
+func_avg avg 128
+#if (__riscv_xlen == 64)
+func_avg w_avg 256
+func_avg w_avg 128
+#endif
diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
new file mode 100644
index 0000000000..85b1ede061
--- /dev/null
+++ b/libavcodec/riscv/vvcdsp_init.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vvc/dsp.h"
+
+#define bf(fn, bd, opt) fn##_##bd##_##opt
+
+#define AVG_PROTOTYPES(bd, opt) \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height); \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height, \
+ int denom, int w0, int w1, int o0, int o1);
+
+AVG_PROTOTYPES(8, rvv_128)
+AVG_PROTOTYPES(8, rvv_256)
+
+void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
+{
+#if HAVE_RVV
+ const int flags = av_get_cpu_flags();
+
+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ ff_rv_vlen_least(256)) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_256;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
+# endif
+ break;
+ default:
+ break;
+ }
+ } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ ff_rv_vlen_least(128)) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_128;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
+# endif
+ break;
+ default:
+ break;
+ }
+ }
+#endif
+}
diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
index 41e830a98a..c55a37d255 100644
--- a/libavcodec/vvc/dsp.c
+++ b/libavcodec/vvc/dsp.c
@@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
break;
}
-#if ARCH_X86
+#if ARCH_RISCV
+ ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
+#elif ARCH_X86
ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
#endif
}
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 9810ac314c..dcb978549f 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -167,6 +167,7 @@ typedef struct VVCDSPContext {
void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
+void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
#endif /* AVCODEC_VVC_DSP_H */
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-05-21 19:24 ` uk7b
@ 2024-05-21 19:26 ` flow gg
2024-05-25 8:27 ` Rémi Denis-Courmont
1 sibling, 0 replies; 19+ messages in thread
From: flow gg @ 2024-05-21 19:26 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Reordered some here.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-05-21 19:24 ` uk7b
2024-05-21 19:26 ` flow gg
@ 2024-05-25 8:27 ` Rémi Denis-Courmont
1 sibling, 0 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-25 8:27 UTC (permalink / raw)
To: ffmpeg-devel
Hi,
Sorry for the delay; this is not my day job.
On Tuesday 21 May 2024 at 22.24.21 EEST, uk7b@foxmail.com wrote:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> ---
> libavcodec/riscv/Makefile | 2 +
> libavcodec/riscv/vvc_mc_rvv.S | 313 +++++++++++++++++++++++++++++++++
> libavcodec/riscv/vvcdsp_init.c | 71 ++++++++
In keeping with the rest of the project, that should probably go into
libavcodec/riscv/vvc/
> libavcodec/vvc/dsp.c | 4 +-
> libavcodec/vvc/dsp.h | 1 +
> 5 files changed, 390 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
> create mode 100644 libavcodec/riscv/vvcdsp_init.c
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 27b268ae39..6297664fc9 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
> RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
> +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
> diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
> new file mode 100644
> index 0000000000..3c2bac37f7
> --- /dev/null
> +++ b/libavcodec/riscv/vvc_mc_rvv.S
> @@ -0,0 +1,313 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS). + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 w vlen is_w
> + .if \w <= 2
> + vsetivli zero, \w, e8, mf8, ta, ma
> + .elseif \w <= 4 && \vlen == 128
> + vsetivli zero, \w, e8, mf4, ta, ma
> + .elseif \w <= 4 && \vlen >= 256
> + vsetivli zero, \w, e8, mf8, ta, ma
> + .elseif \w <= 8 && \vlen == 128
> + vsetivli zero, \w, e8, mf2, ta, ma
> + .elseif \w <= 8 && \vlen >= 256
> + vsetivli zero, \w, e8, mf4, ta, ma
> + .elseif \w <= 16 && \vlen == 128
> + vsetivli zero, \w, e8, m1, ta, ma
> + .elseif \w <= 16 && \vlen >= 256
> + vsetivli zero, \w, e8, mf2, ta, ma
> + .elseif \w <= 32 && \vlen >= 256
> + li t0, \w
> + vsetvli zero, t0, e8, m1, ta, ma
> + .elseif \w <= (\vlen / 4) || \is_w
> + li t0, 64
> + vsetvli zero, t0, e8, m2, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e8, m4, ta, ma
> + .endif
> +.endm
> +
> +.macro vsetvlstatic16 w vlen is_w
> + .if \w <= 2
> + vsetivli zero, \w, e16, mf4, ta, ma
> + .elseif \w <= 4 && \vlen == 128
> + vsetivli zero, \w, e16, mf2, ta, ma
> + .elseif \w <= 4 && \vlen >= 256
> + vsetivli zero, \w, e16, mf4, ta, ma
> + .elseif \w <= 8 && \vlen == 128
> + vsetivli zero, \w, e16, m1, ta, ma
> + .elseif \w <= 8 && \vlen >= 256
> + vsetivli zero, \w, e16, mf2, ta, ma
> + .elseif \w <= 16 && \vlen == 128
> + vsetivli zero, \w, e16, m2, ta, ma
> + .elseif \w <= 16 && \vlen >= 256
> + vsetivli zero, \w, e16, m1, ta, ma
> + .elseif \w <= 32 && \vlen >= 256
> + li t0, \w
> + vsetvli zero, t0, e16, m2, ta, ma
> + .elseif \w <= (\vlen / 4) || \is_w
> + li t0, 64
> + vsetvli zero, t0, e16, m4, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e16, m8, ta, ma
> + .endif
> +.endm
> +
> +.macro vsetvlstatic32 w vlen
> + .if \w <= 2
> + vsetivli zero, \w, e32, mf2, ta, ma
> + .elseif \w <= 4 && \vlen == 128
> + vsetivli zero, \w, e32, m1, ta, ma
> + .elseif \w <= 4 && \vlen >= 256
> + vsetivli zero, \w, e32, mf2, ta, ma
> + .elseif \w <= 8 && \vlen == 128
> + vsetivli zero, \w, e32, m2, ta, ma
> + .elseif \w <= 8 && \vlen >= 256
> + vsetivli zero, \w, e32, m1, ta, ma
> + .elseif \w <= 16 && \vlen == 128
> + vsetivli zero, \w, e32, m4, ta, ma
> + .elseif \w <= 16 && \vlen >= 256
> + vsetivli zero, \w, e32, m2, ta, ma
> + .elseif \w <= 32 && \vlen >= 256
> + li t0, \w
> + vsetvli zero, t0, e32, m4, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e32, m8, ta, ma
> + .endif
> +.endm
> +
> +.macro avg_nx1 w vlen
> + vsetvlstatic16 \w, \vlen, 0
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vadd.vv v8, v8, v0
> + vmax.vx v8, v8, zero
> + vsetvlstatic8 \w, \vlen, 0
> + vnclipu.wi v8, v8, 7
> + vse8.v v8, (a0)
> +.endm
> +
> +.macro avg w h vlen
> + csrw vxrm, zero
> +
> +.if \w <= (\vlen / 4) && \h >= 4
> +.rept (\h / 4)
> + addi t1, a3, 128*2
> + addi t3, a2, 128*2*2
> + add t2, a0, a1
> + sh1add t5, a1, a0
> + addi t4, a3, 128*2*2
> + addi a7, a3, 128*2*3
> + addi t6, a2, 128*2*3
> + vsetvlstatic16 \w, \vlen, 0
> + addi t0, a2, 128*2
> + add a6, t5, a1
> + vle16.v v0, (a2)
> + vle16.v v4, (a3)
> + vle16.v v8, (t0)
> + vle16.v v12, (t1)
> + vle16.v v16, (t3)
> + vle16.v v20, (t4)
> + vle16.v v24, (t6)
> + vle16.v v28, (a7)
> + vadd.vv v4, v4, v0
> + vadd.vv v12, v12, v8
> + vadd.vv v20, v20, v16
> + vadd.vv v28, v28, v24
> + vmax.vx v4, v4, zero
> + vmax.vx v12, v12, zero
> + vmax.vx v20, v20, zero
> + vmax.vx v28, v28, zero
> + vsetvlstatic8 \w, \vlen, 0
> + addi a2, a2, 128*8
> + vnclipu.wi v4, v4, 7
> + vnclipu.wi v12, v12, 7
> + vnclipu.wi v20, v20, 7
> + vnclipu.wi v28, v28, 7
> + addi a3, a3, 128*8
> + vse8.v v4, (a0)
> + vse8.v v12, (t2)
> + vse8.v v20, (t5)
> + sh2add a0, a1, a0
> + vse8.v v28, (a6)
> +.endr
> +
> +.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
> +.rept (\h / 2)
> + vsetvlstatic16 \w, \vlen, 0
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + add t2, a0, a1
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vle16.v v16, (t0)
> + vle16.v v24, (t1)
> + vadd.vv v8, v8, v0
> + vadd.vv v24, v24, v16
> + vmax.vx v8, v8, zero
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \w, \vlen, 0
> + addi a2, a2, 128*4
> + vnclipu.wi v8, v8, 7
> + vnclipu.wi v24, v24, 7
> + addi a3, a3, 128*4
> + vse8.v v8, (a0)
> + vse8.v v24, (t2)
> + sh1add a0, a1, a0
> +.endr
> +
> +.else
> +.rept \h
> + avg_nx1 \w, \vlen
> + .if \w == 128 && \vlen == 128
> + addi a2, a2, 64*2
> + addi a3, a3, 64*2
> + addi a0, a0, 64
> + avg_nx1 \w, \vlen
> + addi a0, a0, -64
> + addi a2, a2, 128
> + addi a3, a3, 128
> + .else
> + addi a2, a2, 128*2
> + addi a3, a3, 128*2
> + .endif
> + add a0, a0, a1
> +.endr
> +.endif
> +.endm
> +
> +.macro w_avg_nx1 w vlen
> + vsetvlstatic16 \w, \vlen, 1
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vwmul.vx v16, v0, a7
> + vwmacc.vx v16, t3, v8
> + vsetvlstatic32 \w, \vlen
> + vadd.vx v16, v16, t4
> + vsetvlstatic16 \w, \vlen, 1
> + vnsrl.wx v16, v16, t6
> + vmax.vx v16, v16, zero
> + vsetvlstatic8 \w, \vlen, 1
> + vnclipu.wi v16, v16, 0
> + vse8.v v16, (a0)
> +.endm
> +
> +#if (__riscv_xlen == 64)
> +.macro w_avg w h vlen
> + csrw vxrm, zero
> + addi t6, a6, 7
> + ld t3, (sp)
> + ld t4, 8(sp)
> + ld t5, 16(sp)
> + add t4, t4, t5
> + addi t4, t4, 1 // o0 + o1 + 1
> + addi t5, t6, -1 // shift - 1
> + sll t4, t4, t5
> +
> +.if \w <= (\vlen / 8)
> + .rept (\h / 2)
> + vsetvlstatic16 \w, \vlen, 1
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vle16.v v20, (t0)
> + vle16.v v24, (t1)
> + vwmul.vx v16, v0, a7
> + vwmul.vx v28, v20, a7
> + vwmacc.vx v16, t3, v8
> + vwmacc.vx v28, t3, v24
> + vsetvlstatic32 \w, \vlen
> + add t2, a0, a1
> + vadd.vx v16, v16, t4
> + vadd.vx v28, v28, t4
> + vsetvlstatic16 \w, \vlen, 1
> + vnsrl.wx v16, v16, t6
> + vnsrl.wx v28, v28, t6
> + vmax.vx v16, v16, zero
> + vmax.vx v28, v28, zero
> + vsetvlstatic8 \w, \vlen, 1
> + addi a2, a2, 128*4
> + vnclipu.wi v16, v16, 0
> + vnclipu.wi v28, v28, 0
> + vse8.v v16, (a0)
> + addi a3, a3, 128*4
> + vse8.v v28, (t2)
> + sh1add a0, a1, a0
> + .endr
> +.else
> + .rept \h
> + w_avg_nx1 \w, \vlen
> + .if \w == (\vlen / 2)
> + addi a2, a2, (\vlen / 2)
> + addi a3, a3, (\vlen / 2)
> + addi a0, a0, (\vlen / 4)
> + w_avg_nx1 \w, \vlen
> + addi a2, a2, -(\vlen / 2)
> + addi a3, a3, -(\vlen / 2)
> + addi a0, a0, -(\vlen / 4)
> + .elseif \w == 128 && \vlen == 128
> + .rept 3
> + addi a2, a2, (\vlen / 2)
> + addi a3, a3, (\vlen / 2)
> + addi a0, a0, (\vlen / 4)
> + w_avg_nx1 \w, \vlen
> + .endr
> + addi a2, a2, -(\vlen / 2) * 3
> + addi a3, a3, -(\vlen / 2) * 3
> + addi a0, a0, -(\vlen / 4) * 3
> + .endif
> +
> + addi a2, a2, 128*2
> + addi a3, a3, 128*2
> + add a0, a0, a1
> + .endr
> +.endif
> +.endm
> +#endif
> +
> +.macro func_avg name vlen
> +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
> +.irp w,2,4,8,16,32,64,128
> + li t3, \w
> + bne a4, t3, \name\vlen\()end\w
> +.irp h,2,4,8,16,32,64,128
> + li t4, \h
> + bne a5, t4, \name\vlen\()end\w\h
Expanding the macro 49 times, with up to 14 branches to get there, is maybe not
such a great idea. It might look nice on the checkasm µbenchmarks because the
branches under test get predicted and cached.
But in real use, branch prediction will not work so well, and the I-cache will
be filled with all variants of the same function. Indeed, this seems to result
in about 0.5 MiB of code. Even if only one half is needed (the 128-bit or the
256+-bit variants), that's a lot. For comparison, x86 uses just about 10 KiB,
also with two variants. What I make out from the arcane forbidden CISC arts there:
- functions are specialised only in one dimension, not both,
- dispatch tables avoid multiplying branches.
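To make the dispatch-table point concrete, here is a minimal C sketch, using
the same argument list as ff_vvc_avg in the patch (the kernel body, the table
layout and the ctz-based index are illustrative assumptions, and one generic
kernel stands in for the seven width-specialised ones):
#include <stddef.h>
#include <stdint.h>
typedef void (*avg_fn)(uint8_t *dst, ptrdiff_t dst_stride,
                       const int16_t *src0, const int16_t *src1,
                       int width, int height);
/* Stand-in 8-bit kernel: clip((src0 + src1 + 64) >> 7) to [0, 255], stepping
 * through the int16 intermediates with the fixed 128-sample stride that the
 * assembly's 128*2 byte offsets correspond to. */
static void avg8_any(uint8_t *dst, ptrdiff_t dst_stride,
                     const int16_t *src0, const int16_t *src1,
                     int width, int height)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int v = (src0[x] + src1[x] + 64) >> 7;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        src0 += 128;
        src1 += 128;
        dst  += dst_stride;
    }
}
/* Width is a power of two in 2..128, so ctz(width) - 1 indexes 0..6; a real
 * table would hold seven width-specialised kernels instead of one. */
static const avg_fn avg_tab[7] = {
    avg8_any, avg8_any, avg8_any, avg8_any, avg8_any, avg8_any, avg8_any,
};
void avg8_dispatch(uint8_t *dst, ptrdiff_t dst_stride,
                   const int16_t *src0, const int16_t *src1,
                   int width, int height)
{
    /* One table load and one indirect jump replace the compare-and-branch chain. */
    avg_tab[__builtin_ctz(width) - 1](dst, dst_stride, src0, src1,
                                      width, height);
}
A jump-table version along these lines shows up later in this thread as the
AVG_JMP_TABLE / AVG_J macros, which keep the per-size bodies but reach them
through a table of offsets instead of nested branches.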
> + \name \w \h \vlen
> + ret
> +\name\vlen\()end\w\h:
> +.endr
> +\name\vlen\()end\w:
> +.endr
> +endfunc
> +.endm
> +
> +func_avg avg 256
> +func_avg avg 128
> +#if (__riscv_xlen == 64)
> +func_avg w_avg 256
> +func_avg w_avg 128
> +#endif
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH v5] lavc/vvc_mc: R-V V avg w_avg
@ 2024-07-08 15:41 Rémi Denis-Courmont
2024-07-10 10:02 ` [FFmpeg-devel] [PATCH] " uk7b
0 siblings, 1 reply; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-08 15:41 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Monday 1 July 2024 at 19.09.01 EEST, flow gg wrote:
> I reviewed it again, the purpose of is_w is to limit lmul to a maximum of
> 1/4 of vlen,
1/4 of vlen? Do you mean limit to EMUL=1 for EEW=32 and EMUL=1/4 for EEW=8?
Limiting LMUL to less than 1 at maximum EEW is useless from a functional
standpoint, since fractional registers cannot be addressed individually. (Of
course it might still be useful for performance reasons.)
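For reference, a worked instance of the ratios involved (assuming the usual
EMUL = (EEW / SEW) * LMUL rule, and taking the settings the posted macros pick
for w = 8 at VLEN >= 256): vsetvlstatic8 selects e8/mf4, and the widened forms
come out as e16 with (16/8) * 1/4 = mf2 and e32 with (32/8) * 1/4 = m1, which
is exactly what vsetvlstatic16 and vsetvlstatic32 set for that case. The
32-bit intermediates therefore already occupy a whole register, and shrinking
the 8-bit LMUL any further cannot free anything individually addressable.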
> to prevent vector register shortage, which can also be
> considered as vset limiting lmul. I renamed it to quarter_len_limit.
TBH, I don't really understand.
If a lower LMUL limit is reached, then specialisations for the corresponding
VLEN are simply unnecessary/infeasible and the code for lower VLEN should be
used.
If a higher LMUL limit is reached due to register pressure (or the 8 hard
limit), then the given VLEN cannot be supported at all, or requires some
completely different code.
Either way, I don't really follow why the vsetvlstatic macros need to be involved.
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
^ permalink raw reply [flat|nested] 19+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-07-08 15:41 [FFmpeg-devel] [PATCH v5] " Rémi Denis-Courmont
@ 2024-07-10 10:02 ` uk7b
2024-07-16 14:21 ` Rémi Denis-Courmont
0 siblings, 1 reply; 19+ messages in thread
From: uk7b @ 2024-07-10 10:02 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
avg_8_2x2_c : 1.2 1.2
avg_8_2x2_rvv_i32 : 0.7 0.7
avg_8_2x4_c : 2.0 2.0
avg_8_2x4_rvv_i32 : 1.2 1.0
avg_8_2x8_c : 3.7 4.0
avg_8_2x8_rvv_i32 : 1.7 1.5
avg_8_2x16_c : 7.2 7.5
avg_8_2x16_rvv_i32 : 3.0 2.7
avg_8_2x32_c : 14.5 15.2
avg_8_2x32_rvv_i32 : 5.5 5.0
avg_8_2x64_c : 53.5 42.2
avg_8_2x64_rvv_i32 : 42.0 33.2
avg_8_2x128_c : 93.5 86.0
avg_8_2x128_rvv_i32 : 79.2 74.0
avg_8_4x2_c : 1.7 2.0
avg_8_4x2_rvv_i32 : 1.0 1.0
avg_8_4x4_c : 3.5 3.5
avg_8_4x4_rvv_i32 : 1.2 1.0
avg_8_4x8_c : 6.5 7.0
avg_8_4x8_rvv_i32 : 1.7 1.7
avg_8_4x16_c : 13.5 14.0
avg_8_4x16_rvv_i32 : 3.0 2.5
avg_8_4x32_c : 26.2 27.5
avg_8_4x32_rvv_i32 : 5.7 5.0
avg_8_4x64_c : 79.0 66.5
avg_8_4x64_rvv_i32 : 41.7 34.2
avg_8_4x128_c : 154.0 128.7
avg_8_4x128_rvv_i32 : 80.5 74.5
avg_8_8x2_c : 3.2 3.2
avg_8_8x2_rvv_i32 : 1.0 0.7
avg_8_8x4_c : 6.5 6.5
avg_8_8x4_rvv_i32 : 1.2 1.0
avg_8_8x8_c : 12.5 13.2
avg_8_8x8_rvv_i32 : 2.0 1.7
avg_8_8x16_c : 25.2 26.5
avg_8_8x16_rvv_i32 : 3.2 2.7
avg_8_8x32_c : 50.0 52.7
avg_8_8x32_rvv_i32 : 6.2 4.7
avg_8_8x64_c : 130.0 112.2
avg_8_8x64_rvv_i32 : 44.2 33.5
avg_8_8x128_c : 241.5 226.7
avg_8_8x128_rvv_i32 : 78.7 74.0
avg_8_16x2_c : 6.2 6.5
avg_8_16x2_rvv_i32 : 1.2 0.7
avg_8_16x4_c : 12.2 13.0
avg_8_16x4_rvv_i32 : 1.7 1.0
avg_8_16x8_c : 24.7 25.7
avg_8_16x8_rvv_i32 : 3.0 1.7
avg_8_16x16_c : 49.0 51.5
avg_8_16x16_rvv_i32 : 5.5 3.2
avg_8_16x32_c : 97.7 102.7
avg_8_16x32_rvv_i32 : 10.5 5.5
avg_8_16x64_c : 219.5 223.5
avg_8_16x64_rvv_i32 : 56.7 34.5
avg_8_16x128_c : 409.7 426.0
avg_8_16x128_rvv_i32 : 98.7 73.5
avg_8_32x2_c : 12.5 13.0
avg_8_32x2_rvv_i32 : 1.7 1.0
avg_8_32x4_c : 24.2 25.5
avg_8_32x4_rvv_i32 : 3.0 1.5
avg_8_32x8_c : 48.5 50.7
avg_8_32x8_rvv_i32 : 5.2 2.7
avg_8_32x16_c : 96.5 101.2
avg_8_32x16_rvv_i32 : 10.2 5.0
avg_8_32x32_c : 192.7 202.5
avg_8_32x32_rvv_i32 : 19.7 9.5
avg_8_32x64_c : 433.5 415.5
avg_8_32x64_rvv_i32 : 38.7 18.2
avg_8_32x128_c : 812.0 820.7
avg_8_32x128_rvv_i32 : 145.2 73.0
avg_8_64x2_c : 24.0 25.2
avg_8_64x2_rvv_i32 : 2.7 1.5
avg_8_64x4_c : 48.0 50.5
avg_8_64x4_rvv_i32 : 5.2 2.5
avg_8_64x8_c : 117.5 100.7
avg_8_64x8_rvv_i32 : 10.0 4.7
avg_8_64x16_c : 208.5 201.0
avg_8_64x16_rvv_i32 : 19.0 9.0
avg_8_64x32_c : 382.7 402.0
avg_8_64x32_rvv_i32 : 37.5 17.5
avg_8_64x64_c : 830.0 834.2
avg_8_64x64_rvv_i32 : 75.5 34.5
avg_8_64x128_c : 2008.0 1705.2
avg_8_64x128_rvv_i32 : 205.5 149.2
avg_8_128x2_c : 48.7 51.0
avg_8_128x2_rvv_i32 : 5.2 2.7
avg_8_128x4_c : 96.5 101.2
avg_8_128x4_rvv_i32 : 10.2 4.7
avg_8_128x8_c : 192.2 202.0
avg_8_128x8_rvv_i32 : 19.7 9.5
avg_8_128x16_c : 385.5 403.2
avg_8_128x16_rvv_i32 : 38.7 18.2
avg_8_128x32_c : 788.0 805.7
avg_8_128x32_rvv_i32 : 77.0 36.2
avg_8_128x64_c : 1597.5 1658.0
avg_8_128x64_rvv_i32 : 175.5 78.7
avg_8_128x128_c : 3156.0 3282.5
avg_8_128x128_rvv_i32 : 369.2 276.7
w_avg_8_2x2_c : 1.5 1.5
w_avg_8_2x2_rvv_i32 : 1.2 1.0
w_avg_8_2x4_c : 2.7 2.5
w_avg_8_2x4_rvv_i32 : 1.7 1.7
w_avg_8_2x8_c : 5.0 4.7
w_avg_8_2x8_rvv_i32 : 2.7 2.5
w_avg_8_2x16_c : 9.7 9.5
w_avg_8_2x16_rvv_i32 : 4.7 4.5
w_avg_8_2x32_c : 18.7 18.5
w_avg_8_2x32_rvv_i32 : 9.0 7.7
w_avg_8_2x64_c : 64.0 51.2
w_avg_8_2x64_rvv_i32 : 50.0 38.2
w_avg_8_2x128_c : 107.7 94.0
w_avg_8_2x128_rvv_i32 : 86.2 75.7
w_avg_8_4x2_c : 2.5 2.5
w_avg_8_4x2_rvv_i32 : 1.2 1.0
w_avg_8_4x4_c : 4.7 4.5
w_avg_8_4x4_rvv_i32 : 1.7 1.5
w_avg_8_4x8_c : 9.0 9.0
w_avg_8_4x8_rvv_i32 : 2.7 2.5
w_avg_8_4x16_c : 17.7 17.5
w_avg_8_4x16_rvv_i32 : 5.0 4.2
w_avg_8_4x32_c : 34.7 35.0
w_avg_8_4x32_rvv_i32 : 9.0 8.0
w_avg_8_4x64_c : 103.2 82.0
w_avg_8_4x64_rvv_i32 : 45.7 37.5
w_avg_8_4x128_c : 210.0 164.5
w_avg_8_4x128_rvv_i32 : 86.2 75.7
w_avg_8_8x2_c : 4.5 4.5
w_avg_8_8x2_rvv_i32 : 1.2 1.2
w_avg_8_8x4_c : 8.7 8.5
w_avg_8_8x4_rvv_i32 : 1.7 1.5
w_avg_8_8x8_c : 17.2 17.2
w_avg_8_8x8_rvv_i32 : 3.2 2.5
w_avg_8_8x16_c : 34.0 34.0
w_avg_8_8x16_rvv_i32 : 5.5 4.2
w_avg_8_8x32_c : 67.7 67.7
w_avg_8_8x32_rvv_i32 : 10.7 8.0
w_avg_8_8x64_c : 174.0 145.5
w_avg_8_8x64_rvv_i32 : 50.0 40.0
w_avg_8_8x128_c : 342.2 294.2
w_avg_8_8x128_rvv_i32 : 85.2 75.2
w_avg_8_16x2_c : 8.5 8.5
w_avg_8_16x2_rvv_i32 : 2.0 1.0
w_avg_8_16x4_c : 16.7 17.0
w_avg_8_16x4_rvv_i32 : 3.2 1.7
w_avg_8_16x8_c : 33.2 33.2
w_avg_8_16x8_rvv_i32 : 5.5 3.0
w_avg_8_16x16_c : 66.5 66.7
w_avg_8_16x16_rvv_i32 : 28.2 5.0
w_avg_8_16x32_c : 134.0 133.5
w_avg_8_16x32_rvv_i32 : 20.0 9.5
w_avg_8_16x64_c : 318.2 344.5
w_avg_8_16x64_rvv_i32 : 71.7 41.7
w_avg_8_16x128_c : 718.0 583.0
w_avg_8_16x128_rvv_i32 : 117.5 78.2
w_avg_8_32x2_c : 16.7 16.7
w_avg_8_32x2_rvv_i32 : 3.7 3.2
w_avg_8_32x4_c : 33.2 33.5
w_avg_8_32x4_rvv_i32 : 6.7 6.0
w_avg_8_32x8_c : 65.7 66.0
w_avg_8_32x8_rvv_i32 : 12.5 11.0
w_avg_8_32x16_c : 132.7 133.5
w_avg_8_32x16_rvv_i32 : 24.0 21.5
w_avg_8_32x32_c : 311.5 263.5
w_avg_8_32x32_rvv_i32 : 47.7 42.5
w_avg_8_32x64_c : 592.0 555.5
w_avg_8_32x64_rvv_i32 : 126.5 97.7
w_avg_8_32x128_c : 1179.0 1139.5
w_avg_8_32x128_rvv_i32 : 238.2 180.7
w_avg_8_64x2_c : 32.7 33.0
w_avg_8_64x2_rvv_i32 : 6.0 3.2
w_avg_8_64x4_c : 65.7 66.0
w_avg_8_64x4_rvv_i32 : 11.5 5.7
w_avg_8_64x8_c : 134.0 132.2
w_avg_8_64x8_rvv_i32 : 22.7 11.0
w_avg_8_64x16_c : 281.2 262.5
w_avg_8_64x16_rvv_i32 : 44.2 21.5
w_avg_8_64x32_c : 646.2 570.0
w_avg_8_64x32_rvv_i32 : 88.0 42.5
w_avg_8_64x64_c : 1203.0 1066.7
w_avg_8_64x64_rvv_i32 : 210.7 90.5
w_avg_8_64x128_c : 2688.0 2156.2
w_avg_8_64x128_rvv_i32 : 443.0 214.7
w_avg_8_128x2_c : 65.7 66.0
w_avg_8_128x2_rvv_i32 : 11.2 5.5
w_avg_8_128x4_c : 131.0 133.0
w_avg_8_128x4_rvv_i32 : 22.0 10.2
w_avg_8_128x8_c : 263.5 273.0
w_avg_8_128x8_rvv_i32 : 43.2 20.0
w_avg_8_128x16_c : 525.7 528.0
w_avg_8_128x16_rvv_i32 : 85.5 39.2
w_avg_8_128x32_c : 1064.5 1211.0
w_avg_8_128x32_rvv_i32 : 170.7 78.5
w_avg_8_128x64_c : 2305.5 2350.7
w_avg_8_128x64_rvv_i32 : 400.0 177.5
w_avg_8_128x128_c : 4771.7 4992.7
w_avg_8_128x128_rvv_i32 : 757.5 371.5
---
libavcodec/riscv/vvc/Makefile | 2 +
libavcodec/riscv/vvc/vvc_mc_rvv.S | 288 +++++++++++++++++++++++++++++
libavcodec/riscv/vvc/vvcdsp_init.c | 72 ++++++++
libavcodec/vvc/dsp.c | 4 +-
libavcodec/vvc/dsp.h | 1 +
5 files changed, 366 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/riscv/vvc/Makefile
create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S
create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c
diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
new file mode 100644
index 0000000000..582b051579
--- /dev/null
+++ b/libavcodec/riscv/vvc/Makefile
@@ -0,0 +1,2 @@
+OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
new file mode 100644
index 0000000000..8cf4bcf680
--- /dev/null
+++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vsetvlstatic8 w, vlen, max_lmul=m4
+ .if \w == 2 && \vlen == 128
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e8, m1, ta, ma
+ .elseif \w == 32 && \vlen == 128
+ li t0, \w
+ vsetvli zero, t0, e8, m2, ta, ma
+ .elseif \w <= 4 && \vlen == 256
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e8, m1, ta, ma
+ .elseif \w == 64 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e8, m2, ta, ma
+ // (\w <= 128 && \vlen == 128) || (\w == 128 && \vlen == 256)
+ .else
+ li t0, \w
+ vsetvli zero, t0, e8, \max_lmul, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic16 w, vlen, max_lmul=m8
+ .if \w == 2 && \vlen == 128
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e16, m2, ta, ma
+ .elseif \w == 32 && \vlen == 128
+ li t0, \w
+ vsetvli zero, t0, e16, m4, ta, ma
+ .elseif \w <= 4 && \vlen == 256
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e16, m2, ta, ma
+ .elseif \w == 64 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e16, m4, ta, ma
+ // (\w <= 128 && \vlen == 128) || (\w == 128 && \vlen == 256)
+ .else
+ li t0, \w
+ vsetvli zero, t0, e16, \max_lmul, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic32 w, vlen
+ .if \w == 2
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e32, m4, ta, ma
+ .elseif \w == 4 && \vlen == 256
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e32, m4, ta, ma
+ // (\w <= 128 && \vlen == 128) || (\w <= 128 && \vlen == 256)
+ .else
+ li t0, \w
+ vsetvli zero, t0, e32, m8, ta, ma
+ .endif
+.endm
+
+.macro avg w, vlen, id
+\id\w\vlen:
+.if \w < 128
+ vsetvlstatic16 \w, \vlen
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ add t2, a0, a1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ addi a5, a5, -2
+ vle16.v v16, (t0)
+ vle16.v v24, (t1)
+ vadd.vv v8, v8, v0
+ vadd.vv v24, v24, v16
+ vmax.vx v8, v8, zero
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \w, \vlen
+ addi a2, a2, 128*4
+ vnclipu.wi v8, v8, 7
+ vnclipu.wi v24, v24, 7
+ addi a3, a3, 128*4
+ vse8.v v8, (a0)
+ vse8.v v24, (t2)
+ sh1add a0, a1, a0
+.else
+ addi a5, a5, -1
+ mv t1, a0
+ mv t2, a2
+ mv t3, a3
+ mv t4, a4
+1:
+ vsetvli t0, a4, e16, m8, ta, ma
+ sub a4, a4, t0
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vadd.vv v8, v8, v0
+ vmax.vx v8, v8, zero
+ vsetvli zero, zero, e8, m4, ta, ma
+ vnclipu.wi v8, v8, 7
+ vse8.v v8, (a0)
+ sh1add a2, t0, a2
+ sh1add a3, t0, a3
+ add a0, a0, t0
+ bnez a4, 1b
+ add a0, t1, a1
+ addi a2, t2, 128*2
+ addi a3, t3, 128*2
+ mv a4, t4
+.endif
+ bnez a5, \id\w\vlen\()b
+ ret
+.endm
+
+
+.macro AVG_JMP_TABLE id, vlen
+const jmp_table_\id\vlen
+ .4byte \id\()2\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()4\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()8\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()16\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()32\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
+endconst
+.endm
+
+.macro AVG_J vlen, id
+ clz t1, a4
+ neg t1, t1
+ lla t5, jmp_table_\id\vlen
+ sh2add t1, t1, t5
+ lw t1, ((__riscv_xlen-2)<<2)(t1)
+ add t1, t1, t5
+ jr t1
+.endm
+
+.macro func_avg vlen
+func ff_vvc_avg_8_rvv_\vlen\(), zve32x
+ AVG_JMP_TABLE 1, \vlen
+ csrwi vxrm, 0
+ AVG_J \vlen, 1
+ .irp w,2,4,8,16,32,64,128
+ avg \w, \vlen, 1
+ .endr
+endfunc
+.endm
+
+.macro w_avg w, vlen, id
+\id\w\vlen:
+.if \w < 32
+ vsetvlstatic16 \w, \vlen, m4
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ addi a5, a5, -2
+ vle16.v v20, (t0)
+ vle16.v v24, (t1)
+ vwmul.vx v16, v0, a7
+ vwmul.vx v28, v20, a7
+ vwmacc.vx v16, t3, v8
+ vwmacc.vx v28, t3, v24
+ vsetvlstatic32 \w, \vlen
+ add t2, a0, a1
+ vadd.vx v16, v16, t4
+ vadd.vx v28, v28, t4
+ vsetvlstatic16 \w, \vlen, m4
+ vnsrl.wx v16, v16, t6
+ vnsrl.wx v28, v28, t6
+ vmax.vx v16, v16, zero
+ vmax.vx v28, v28, zero
+ vsetvlstatic8 \w, \vlen, m2
+ addi a2, a2, 128*4
+ vnclipu.wi v16, v16, 0
+ vnclipu.wi v28, v28, 0
+ vse8.v v16, (a0)
+ addi a3, a3, 128*4
+ vse8.v v28, (t2)
+ sh1add a0, a1, a0
+.else
+ addi a5, a5, -1
+ mv t1, a0
+ mv t2, a2
+ mv t5, a3
+ mv a6, a4
+1:
+ vsetvli t0, a4, e16, m4, ta, ma
+ sub a4, a4, t0
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vwmul.vx v16, v0, a7
+ vwmacc.vx v16, t3, v8
+ vsetvli zero, zero, e32, m8, ta, ma
+ vadd.vx v16, v16, t4
+ vsetvli zero, zero, e16, m4, ta, ma
+ vnsrl.wx v16, v16, t6
+ vmax.vx v16, v16, zero
+ vsetvli zero, zero, e8, m2, ta, ma
+ vnclipu.wi v16, v16, 0
+ vse8.v v16, (a0)
+ sh1add a2, t0, a2
+ sh1add a3, t0, a3
+ add a0, a0, t0
+ bnez a4, 1b
+ add a0, t1, a1
+ addi a2, t2, 128*2
+ addi a3, t5, 128*2
+ mv a4, a6
+.endif
+ bnez a5, \id\w\vlen\()b
+ ret
+.endm
+
+
+.macro func_w_avg vlen
+func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x
+ AVG_JMP_TABLE 2, \vlen
+ csrwi vxrm, 0
+ addi t6, a6, 7
+ ld t3, (sp)
+ ld t4, 8(sp)
+ ld t5, 16(sp)
+ addi t4, t4, 1 // o0 + o1 + 1
+ add t4, t4, t5
+ addi t5, t6, -1 // shift - 1
+ sll t4, t4, t5
+ AVG_J \vlen, 2
+ .irp w,2,4,8,16,32,64,128
+ w_avg \w, \vlen, 2
+ .endr
+endfunc
+.endm
+
+func_avg 128
+func_avg 256
+#if (__riscv_xlen == 64)
+func_w_avg 128
+func_w_avg 256
+#endif
diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
new file mode 100644
index 0000000000..9819a7c570
--- /dev/null
+++ b/libavcodec/riscv/vvc/vvcdsp_init.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vvc/dsp.h"
+
+#define bf(fn, bd, opt) fn##_##bd##_##opt
+
+#define AVG_PROTOTYPES(bd, opt) \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height); \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height, \
+ int denom, int w0, int w1, int o0, int o1);
+
+AVG_PROTOTYPES(8, rvv_128)
+AVG_PROTOTYPES(8, rvv_256)
+
+void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
+{
+#if HAVE_RVV
+ const int flags = av_get_cpu_flags();
+ int vlenb = ff_get_rv_vlenb();
+
+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ vlenb >= 32) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_256;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
+# endif
+ break;
+ default:
+ break;
+ }
+ } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ vlenb >= 16) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_128;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
+# endif
+ break;
+ default:
+ break;
+ }
+ }
+#endif
+}
diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
index 41e830a98a..c55a37d255 100644
--- a/libavcodec/vvc/dsp.c
+++ b/libavcodec/vvc/dsp.c
@@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
break;
}
-#if ARCH_X86
+#if ARCH_RISCV
+ ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
+#elif ARCH_X86
ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
#endif
}
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 1f14096c41..e03236dd76 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -180,6 +180,7 @@ typedef struct VVCDSPContext {
void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
+void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
#endif /* AVCODEC_VVC_DSP_H */
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-07-10 10:02 ` [FFmpeg-devel] [PATCH] " uk7b
@ 2024-07-16 14:21 ` Rémi Denis-Courmont
2024-07-18 15:02 ` uk7b
2024-07-18 15:04 ` flow gg
0 siblings, 2 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-16 14:21 UTC (permalink / raw)
To: ffmpeg-devel
On Wednesday, 10 July 2024 at 13.02.44 EEST, uk7b@foxmail.com wrote:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
> avg_8_2x2_c : 1.2 1.2
> avg_8_2x2_rvv_i32 : 0.7 0.7
> avg_8_2x4_c : 2.0 2.0
> avg_8_2x4_rvv_i32 : 1.2 1.0
> avg_8_2x8_c : 3.7 4.0
> avg_8_2x8_rvv_i32 : 1.7 1.5
> avg_8_2x16_c : 7.2 7.5
> avg_8_2x16_rvv_i32 : 3.0 2.7
> avg_8_2x32_c : 14.5 15.2
> avg_8_2x32_rvv_i32 : 5.5 5.0
> avg_8_2x64_c : 53.5 42.2
> avg_8_2x64_rvv_i32 : 42.0 33.2
> avg_8_2x128_c : 93.5 86.0
> avg_8_2x128_rvv_i32 : 79.2 74.0
> avg_8_4x2_c : 1.7 2.0
> avg_8_4x2_rvv_i32 : 1.0 1.0
> avg_8_4x4_c : 3.5 3.5
> avg_8_4x4_rvv_i32 : 1.2 1.0
> avg_8_4x8_c : 6.5 7.0
> avg_8_4x8_rvv_i32 : 1.7 1.7
> avg_8_4x16_c : 13.5 14.0
> avg_8_4x16_rvv_i32 : 3.0 2.5
> avg_8_4x32_c : 26.2 27.5
> avg_8_4x32_rvv_i32 : 5.7 5.0
> avg_8_4x64_c : 79.0 66.5
> avg_8_4x64_rvv_i32 : 41.7 34.2
> avg_8_4x128_c : 154.0 128.7
> avg_8_4x128_rvv_i32 : 80.5 74.5
> avg_8_8x2_c : 3.2 3.2
> avg_8_8x2_rvv_i32 : 1.0 0.7
> avg_8_8x4_c : 6.5 6.5
> avg_8_8x4_rvv_i32 : 1.2 1.0
> avg_8_8x8_c : 12.5 13.2
> avg_8_8x8_rvv_i32 : 2.0 1.7
> avg_8_8x16_c : 25.2 26.5
> avg_8_8x16_rvv_i32 : 3.2 2.7
> avg_8_8x32_c : 50.0 52.7
> avg_8_8x32_rvv_i32 : 6.2 4.7
> avg_8_8x64_c : 130.0 112.2
> avg_8_8x64_rvv_i32 : 44.2 33.5
> avg_8_8x128_c : 241.5 226.7
> avg_8_8x128_rvv_i32 : 78.7 74.0
> avg_8_16x2_c : 6.2 6.5
> avg_8_16x2_rvv_i32 : 1.2 0.7
> avg_8_16x4_c : 12.2 13.0
> avg_8_16x4_rvv_i32 : 1.7 1.0
> avg_8_16x8_c : 24.7 25.7
> avg_8_16x8_rvv_i32 : 3.0 1.7
> avg_8_16x16_c : 49.0 51.5
> avg_8_16x16_rvv_i32 : 5.5 3.2
> avg_8_16x32_c : 97.7 102.7
> avg_8_16x32_rvv_i32 : 10.5 5.5
> avg_8_16x64_c : 219.5 223.5
> avg_8_16x64_rvv_i32 : 56.7 34.5
> avg_8_16x128_c : 409.7 426.0
> avg_8_16x128_rvv_i32 : 98.7 73.5
> avg_8_32x2_c : 12.5 13.0
> avg_8_32x2_rvv_i32 : 1.7 1.0
> avg_8_32x4_c : 24.2 25.5
> avg_8_32x4_rvv_i32 : 3.0 1.5
> avg_8_32x8_c : 48.5 50.7
> avg_8_32x8_rvv_i32 : 5.2 2.7
> avg_8_32x16_c : 96.5 101.2
> avg_8_32x16_rvv_i32 : 10.2 5.0
> avg_8_32x32_c : 192.7 202.5
> avg_8_32x32_rvv_i32 : 19.7 9.5
> avg_8_32x64_c : 433.5 415.5
> avg_8_32x64_rvv_i32 : 38.7 18.2
> avg_8_32x128_c : 812.0 820.7
> avg_8_32x128_rvv_i32 : 145.2 73.0
> avg_8_64x2_c : 24.0 25.2
> avg_8_64x2_rvv_i32 : 2.7 1.5
> avg_8_64x4_c : 48.0 50.5
> avg_8_64x4_rvv_i32 : 5.2 2.5
> avg_8_64x8_c : 117.5 100.7
> avg_8_64x8_rvv_i32 : 10.0 4.7
> avg_8_64x16_c : 208.5 201.0
> avg_8_64x16_rvv_i32 : 19.0 9.0
> avg_8_64x32_c : 382.7 402.0
> avg_8_64x32_rvv_i32 : 37.5 17.5
> avg_8_64x64_c : 830.0 834.2
> avg_8_64x64_rvv_i32 : 75.5 34.5
> avg_8_64x128_c : 2008.0 1705.2
> avg_8_64x128_rvv_i32 : 205.5 149.2
> avg_8_128x2_c : 48.7 51.0
> avg_8_128x2_rvv_i32 : 5.2 2.7
> avg_8_128x4_c : 96.5 101.2
> avg_8_128x4_rvv_i32 : 10.2 4.7
> avg_8_128x8_c : 192.2 202.0
> avg_8_128x8_rvv_i32 : 19.7 9.5
> avg_8_128x16_c : 385.5 403.2
> avg_8_128x16_rvv_i32 : 38.7 18.2
> avg_8_128x32_c : 788.0 805.7
> avg_8_128x32_rvv_i32 : 77.0 36.2
> avg_8_128x64_c : 1597.5 1658.0
> avg_8_128x64_rvv_i32 : 175.5 78.7
> avg_8_128x128_c : 3156.0 3282.5
> avg_8_128x128_rvv_i32 : 369.2 276.7
> w_avg_8_2x2_c : 1.5 1.5
> w_avg_8_2x2_rvv_i32 : 1.2 1.0
> w_avg_8_2x4_c : 2.7 2.5
> w_avg_8_2x4_rvv_i32 : 1.7 1.7
> w_avg_8_2x8_c : 5.0 4.7
> w_avg_8_2x8_rvv_i32 : 2.7 2.5
> w_avg_8_2x16_c : 9.7 9.5
> w_avg_8_2x16_rvv_i32 : 4.7 4.5
> w_avg_8_2x32_c : 18.7 18.5
> w_avg_8_2x32_rvv_i32 : 9.0 7.7
> w_avg_8_2x64_c : 64.0 51.2
> w_avg_8_2x64_rvv_i32 : 50.0 38.2
> w_avg_8_2x128_c : 107.7 94.0
> w_avg_8_2x128_rvv_i32 : 86.2 75.7
> w_avg_8_4x2_c : 2.5 2.5
> w_avg_8_4x2_rvv_i32 : 1.2 1.0
> w_avg_8_4x4_c : 4.7 4.5
> w_avg_8_4x4_rvv_i32 : 1.7 1.5
> w_avg_8_4x8_c : 9.0 9.0
> w_avg_8_4x8_rvv_i32 : 2.7 2.5
> w_avg_8_4x16_c : 17.7 17.5
> w_avg_8_4x16_rvv_i32 : 5.0 4.2
> w_avg_8_4x32_c : 34.7 35.0
> w_avg_8_4x32_rvv_i32 : 9.0 8.0
> w_avg_8_4x64_c : 103.2 82.0
> w_avg_8_4x64_rvv_i32 : 45.7 37.5
> w_avg_8_4x128_c : 210.0 164.5
> w_avg_8_4x128_rvv_i32 : 86.2 75.7
> w_avg_8_8x2_c : 4.5 4.5
> w_avg_8_8x2_rvv_i32 : 1.2 1.2
> w_avg_8_8x4_c : 8.7 8.5
> w_avg_8_8x4_rvv_i32 : 1.7 1.5
> w_avg_8_8x8_c : 17.2 17.2
> w_avg_8_8x8_rvv_i32 : 3.2 2.5
> w_avg_8_8x16_c : 34.0 34.0
> w_avg_8_8x16_rvv_i32 : 5.5 4.2
> w_avg_8_8x32_c : 67.7 67.7
> w_avg_8_8x32_rvv_i32 : 10.7 8.0
> w_avg_8_8x64_c : 174.0 145.5
> w_avg_8_8x64_rvv_i32 : 50.0 40.0
> w_avg_8_8x128_c : 342.2 294.2
> w_avg_8_8x128_rvv_i32 : 85.2 75.2
> w_avg_8_16x2_c : 8.5 8.5
> w_avg_8_16x2_rvv_i32 : 2.0 1.0
> w_avg_8_16x4_c : 16.7 17.0
> w_avg_8_16x4_rvv_i32 : 3.2 1.7
> w_avg_8_16x8_c : 33.2 33.2
> w_avg_8_16x8_rvv_i32 : 5.5 3.0
> w_avg_8_16x16_c : 66.5 66.7
> w_avg_8_16x16_rvv_i32 : 28.2 5.0
> w_avg_8_16x32_c : 134.0 133.5
> w_avg_8_16x32_rvv_i32 : 20.0 9.5
> w_avg_8_16x64_c : 318.2 344.5
> w_avg_8_16x64_rvv_i32 : 71.7 41.7
> w_avg_8_16x128_c : 718.0 583.0
> w_avg_8_16x128_rvv_i32 : 117.5 78.2
> w_avg_8_32x2_c : 16.7 16.7
> w_avg_8_32x2_rvv_i32 : 3.7 3.2
> w_avg_8_32x4_c : 33.2 33.5
> w_avg_8_32x4_rvv_i32 : 6.7 6.0
> w_avg_8_32x8_c : 65.7 66.0
> w_avg_8_32x8_rvv_i32 : 12.5 11.0
> w_avg_8_32x16_c : 132.7 133.5
> w_avg_8_32x16_rvv_i32 : 24.0 21.5
> w_avg_8_32x32_c : 311.5 263.5
> w_avg_8_32x32_rvv_i32 : 47.7 42.5
> w_avg_8_32x64_c : 592.0 555.5
> w_avg_8_32x64_rvv_i32 : 126.5 97.7
> w_avg_8_32x128_c : 1179.0 1139.5
> w_avg_8_32x128_rvv_i32 : 238.2 180.7
> w_avg_8_64x2_c : 32.7 33.0
> w_avg_8_64x2_rvv_i32 : 6.0 3.2
> w_avg_8_64x4_c : 65.7 66.0
> w_avg_8_64x4_rvv_i32 : 11.5 5.7
> w_avg_8_64x8_c : 134.0 132.2
> w_avg_8_64x8_rvv_i32 : 22.7 11.0
> w_avg_8_64x16_c : 281.2 262.5
> w_avg_8_64x16_rvv_i32 : 44.2 21.5
> w_avg_8_64x32_c : 646.2 570.0
> w_avg_8_64x32_rvv_i32 : 88.0 42.5
> w_avg_8_64x64_c : 1203.0 1066.7
> w_avg_8_64x64_rvv_i32 : 210.7 90.5
> w_avg_8_64x128_c : 2688.0 2156.2
> w_avg_8_64x128_rvv_i32 : 443.0 214.7
> w_avg_8_128x2_c : 65.7 66.0
> w_avg_8_128x2_rvv_i32 : 11.2 5.5
> w_avg_8_128x4_c : 131.0 133.0
> w_avg_8_128x4_rvv_i32 : 22.0 10.2
> w_avg_8_128x8_c : 263.5 273.0
> w_avg_8_128x8_rvv_i32 : 43.2 20.0
> w_avg_8_128x16_c : 525.7 528.0
> w_avg_8_128x16_rvv_i32 : 85.5 39.2
> w_avg_8_128x32_c : 1064.5 1211.0
> w_avg_8_128x32_rvv_i32 : 170.7 78.5
> w_avg_8_128x64_c : 2305.5 2350.7
> w_avg_8_128x64_rvv_i32 : 400.0 177.5
> w_avg_8_128x128_c : 4771.7 4992.7
> w_avg_8_128x128_rvv_i32 : 757.5 371.5
> ---
> libavcodec/riscv/vvc/Makefile | 2 +
> libavcodec/riscv/vvc/vvc_mc_rvv.S | 288 +++++++++++++++++++++++++++++
> libavcodec/riscv/vvc/vvcdsp_init.c | 72 ++++++++
> libavcodec/vvc/dsp.c | 4 +-
> libavcodec/vvc/dsp.h | 1 +
> 5 files changed, 366 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/riscv/vvc/Makefile
> create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S
> create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c
>
> diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
> new file mode 100644
> index 0000000000..582b051579
> --- /dev/null
> +++ b/libavcodec/riscv/vvc/Makefile
> @@ -0,0 +1,2 @@
> +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
> +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
> diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> new file mode 100644
> index 0000000000..8cf4bcf680
> --- /dev/null
> +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> @@ -0,0 +1,288 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS). + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 w, vlen, max_lmul=m4
Again, I don't think that a maximum multiplier belongs here. If the calling
code cannot scale the multiplier up, then it should be a normal loop providing
the same code for all VLENs.
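For reference, the patch's own \w == 128 path already strip-mines with a run-time
vsetvli and therefore works unchanged for any VLEN; a minimal sketch of that
pattern, using the patch's register assignments (a0 = dst, a2/a3 = the two int16
sources, a4 = width), looks like:

1:
        vsetvli    t0, a4, e16, m8, ta, ma  // take as many e16 elements as this VLEN allows
        sub        a4, a4, t0
        vle16.v    v0, (a2)
        vle16.v    v8, (a3)
        vadd.vv    v8, v8, v0
        vmax.vx    v8, v8, zero
        vsetvli    zero, zero, e8, m4, ta, ma
        vnclipu.wi v8, v8, 7                // rounding narrow + clip to u8 (vxrm set by the caller)
        vse8.v     v8, (a0)
        sh1add     a2, t0, a2               // int16 sources advance by 2*vl bytes
        sh1add     a3, t0, a3
        add        a0, a0, t0
        bnez       a4, 1b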
> + .if \w == 2 && \vlen == 128
> + vsetivli zero, \w, e8, mf8, ta, ma
> + .elseif \w == 4 && \vlen == 128
> + vsetivli zero, \w, e8, mf4, ta, ma
> + .elseif \w == 8 && \vlen == 128
> + vsetivli zero, \w, e8, mf2, ta, ma
> + .elseif \w == 16 && \vlen == 128
> + vsetivli zero, \w, e8, m1, ta, ma
> + .elseif \w == 32 && \vlen == 128
> + li t0, \w
> + vsetvli zero, t0, e8, m2, ta, ma
> + .elseif \w <= 4 && \vlen == 256
> + vsetivli zero, \w, e8, mf8, ta, ma
> + .elseif \w == 8 && \vlen == 256
> + vsetivli zero, \w, e8, mf4, ta, ma
> + .elseif \w == 16 && \vlen == 256
> + vsetivli zero, \w, e8, mf2, ta, ma
> + .elseif \w == 32 && \vlen == 256
> + li t0, \w
> + vsetvli zero, t0, e8, m1, ta, ma
> + .elseif \w == 64 && \vlen == 256
> + li t0, \w
> + vsetvli zero, t0, e8, m2, ta, ma
> + // (\w <= 128 && \vlen == 128) || (\w == 128 && \vlen == 256)
> + .else
> + li t0, \w
> + vsetvli zero, t0, e8, \max_lmul, ta, ma
> + .endif
> +.endm
> +
> +.macro vsetvlstatic16 w, vlen, max_lmul=m8
> + .if \w == 2 && \vlen == 128
> + vsetivli zero, \w, e16, mf4, ta, ma
> + .elseif \w == 4 && \vlen == 128
> + vsetivli zero, \w, e16, mf2, ta, ma
> + .elseif \w == 8 && \vlen == 128
> + vsetivli zero, \w, e16, m1, ta, ma
> + .elseif \w == 16 && \vlen == 128
> + vsetivli zero, \w, e16, m2, ta, ma
> + .elseif \w == 32 && \vlen == 128
> + li t0, \w
> + vsetvli zero, t0, e16, m4, ta, ma
> + .elseif \w <= 4 && \vlen == 256
> + vsetivli zero, \w, e16, mf4, ta, ma
> + .elseif \w == 8 && \vlen == 256
> + vsetivli zero, \w, e16, mf2, ta, ma
> + .elseif \w == 16 && \vlen == 256
> + vsetivli zero, \w, e16, m1, ta, ma
> + .elseif \w == 32 && \vlen == 256
> + li t0, \w
> + vsetvli zero, t0, e16, m2, ta, ma
> + .elseif \w == 64 && \vlen == 256
> + li t0, \w
> + vsetvli zero, t0, e16, m4, ta, ma
> + // (\w <= 128 && \vlen == 128) || (\w == 128 && \vlen == 256)
> + .else
> + li t0, \w
> + vsetvli zero, t0, e16, \max_lmul, ta, ma
> + .endif
> +.endm
> +
> +.macro vsetvlstatic32 w, vlen
> + .if \w == 2
> + vsetivli zero, \w, e32, mf2, ta, ma
> + .elseif \w == 4 && \vlen == 128
> + vsetivli zero, \w, e32, m1, ta, ma
> + .elseif \w == 8 && \vlen == 128
> + vsetivli zero, \w, e32, m2, ta, ma
> + .elseif \w == 16 && \vlen == 128
> + vsetivli zero, \w, e32, m4, ta, ma
> + .elseif \w == 4 && \vlen == 256
> + vsetivli zero, \w, e32, mf2, ta, ma
> + .elseif \w == 8 && \vlen == 256
> + vsetivli zero, \w, e32, m1, ta, ma
> + .elseif \w == 16 && \vlen == 256
> + vsetivli zero, \w, e32, m2, ta, ma
> + .elseif \w == 32 && \vlen == 256
> + li t0, \w
> + vsetvli zero, t0, e32, m4, ta, ma
> + // (\w <= 128 && \vlen == 128) || (\w <= 128 && \vlen == 256)
> + .else
> + li t0, \w
> + vsetvli zero, t0, e32, m8, ta, ma
> + .endif
> +.endm
> +
> +.macro avg w, vlen, id
> +\id\w\vlen:
> +.if \w < 128
> + vsetvlstatic16 \w, \vlen
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + add t2, a0, a1
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + addi a5, a5, -2
> + vle16.v v16, (t0)
> + vle16.v v24, (t1)
> + vadd.vv v8, v8, v0
> + vadd.vv v24, v24, v16
> + vmax.vx v8, v8, zero
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \w, \vlen
> + addi a2, a2, 128*4
> + vnclipu.wi v8, v8, 7
> + vnclipu.wi v24, v24, 7
> + addi a3, a3, 128*4
> + vse8.v v8, (a0)
> + vse8.v v24, (t2)
> + sh1add a0, a1, a0
> +.else
> + addi a5, a5, -1
> + mv t1, a0
> + mv t2, a2
> + mv t3, a3
> + mv t4, a4
> +1:
> + vsetvli t0, a4, e16, m8, ta, ma
> + sub a4, a4, t0
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vadd.vv v8, v8, v0
> + vmax.vx v8, v8, zero
> + vsetvli zero, zero, e8, m4, ta, ma
> + vnclipu.wi v8, v8, 7
> + vse8.v v8, (a0)
> + sh1add a2, t0, a2
> + sh1add a3, t0, a3
> + add a0, a0, t0
> + bnez a4, 1b
> + add a0, t1, a1
> + addi a2, t2, 128*2
> + addi a3, t3, 128*2
> + mv a4, t4
> +.endif
> + bnez a5, \id\w\vlen\()b
> + ret
> +.endm
> +
> +
> +.macro AVG_JMP_TABLE id, vlen
> +const jmp_table_\id\vlen
> + .4byte \id\()2\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()4\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()8\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()16\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()32\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
Maybe use .irp here?
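An untested sketch of that (whether the \w and \vlen substitutions expand
cleanly when the .irp sits inside the macro is worth double-checking):

        .irp    w,2,4,8,16,32,64,128
        .4byte  \id\()\w\()\vlen\()f - jmp_table_\id\vlen
        .endr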
> +endconst
> +.endm
> +
> +.macro AVG_J vlen, id
> + clz t1, a4
> + neg t1, t1
> + lla t5, jmp_table_\id\vlen
> + sh2add t1, t1, t5
> + lw t1, ((__riscv_xlen-2)<<2)(t1)
> + add t1, t1, t5
> + jr t1
> +.endm
> +
> +.macro func_avg vlen
> +func ff_vvc_avg_8_rvv_\vlen\(), zve32x
> + AVG_JMP_TABLE 1, \vlen
> + csrwi vxrm, 0
> + AVG_J \vlen, 1
> + .irp w,2,4,8,16,32,64,128
> + avg \w, \vlen, 1
> + .endr
> +endfunc
> +.endm
> +
> +.macro w_avg w, vlen, id
> +\id\w\vlen:
> +.if \w < 32
> + vsetvlstatic16 \w, \vlen, m4
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + addi a5, a5, -2
> + vle16.v v20, (t0)
> + vle16.v v24, (t1)
> + vwmul.vx v16, v0, a7
> + vwmul.vx v28, v20, a7
> + vwmacc.vx v16, t3, v8
> + vwmacc.vx v28, t3, v24
> + vsetvlstatic32 \w, \vlen
> + add t2, a0, a1
> + vadd.vx v16, v16, t4
> + vadd.vx v28, v28, t4
> + vsetvlstatic16 \w, \vlen, m4
> + vnsrl.wx v16, v16, t6
> + vnsrl.wx v28, v28, t6
> + vmax.vx v16, v16, zero
> + vmax.vx v28, v28, zero
> + vsetvlstatic8 \w, \vlen, m2
> + addi a2, a2, 128*4
> + vnclipu.wi v16, v16, 0
> + vnclipu.wi v28, v28, 0
> + vse8.v v16, (a0)
> + addi a3, a3, 128*4
> + vse8.v v28, (t2)
> + sh1add a0, a1, a0
> +.else
> + addi a5, a5, -1
> + mv t1, a0
> + mv t2, a2
> + mv t5, a3
> + mv a6, a4
> +1:
> + vsetvli t0, a4, e16, m4, ta, ma
> + sub a4, a4, t0
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vwmul.vx v16, v0, a7
> + vwmacc.vx v16, t3, v8
> + vsetvli zero, zero, e32, m8, ta, ma
> + vadd.vx v16, v16, t4
> + vsetvli zero, zero, e16, m4, ta, ma
> + vnsrl.wx v16, v16, t6
> + vmax.vx v16, v16, zero
> + vsetvli zero, zero, e8, m2, ta, ma
> + vnclipu.wi v16, v16, 0
> + vse8.v v16, (a0)
> + sh1add a2, t0, a2
> + sh1add a3, t0, a3
> + add a0, a0, t0
> + bnez a4, 1b
> + add a0, t1, a1
> + addi a2, t2, 128*2
> + addi a3, t5, 128*2
> + mv a4, a6
> +.endif
> + bnez a5, \id\w\vlen\()b
> + ret
> +.endm
> +
> +
> +.macro func_w_avg vlen
> +func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x
> + AVG_JMP_TABLE 2, \vlen
> + csrwi vxrm, 0
> + addi t6, a6, 7
> + ld t3, (sp)
> + ld t4, 8(sp)
> + ld t5, 16(sp)
Breaks build if XLEN = 32.
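(ld is an RV64-only instruction. Guarding the whole w_avg path with
#if (__riscv_xlen == 64) is one option; a sketch of an XLEN-aware load instead,
assuming the three stacked int arguments w1/o0/o1 occupy 4-byte slots under the
ilp32 ABI, would be:

#if (__riscv_xlen == 64)
        ld      t3, (sp)        // w1
        ld      t4, 8(sp)       // o0
        ld      t5, 16(sp)      // o1
#else
        lw      t3, (sp)
        lw      t4, 4(sp)
        lw      t5, 8(sp)
#endif
)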
> + addi t4, t4, 1 // o0 + o1 + 1
> + add t4, t4, t5
> + addi t5, t6, -1 // shift - 1
> + sll t4, t4, t5
> + AVG_J \vlen, 2
> + .irp w,2,4,8,16,32,64,128
> + w_avg \w, \vlen, 2
> + .endr
> +endfunc
> +.endm
> +
> +func_avg 128
> +func_avg 256
> +#if (__riscv_xlen == 64)
> +func_w_avg 128
> +func_w_avg 256
> +#endif
> diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
> new file mode 100644
> index 0000000000..9819a7c570
> --- /dev/null
> +++ b/libavcodec/riscv/vvc/vvcdsp_init.c
> @@ -0,0 +1,72 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS). + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA + */
> +
> +#include "config.h"
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> +#include "libavcodec/vvc/dsp.h"
> +
> +#define bf(fn, bd, opt) fn##_##bd##_##opt
> +
> +#define AVG_PROTOTYPES(bd, opt)                                             \
> +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,            \
> +    const int16_t *src0, const int16_t *src1, int width, int height);       \
> +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,          \
> +    const int16_t *src0, const int16_t *src1, int width, int height,        \
> +    int denom, int w0, int w1, int o0, int o1);
> +
> +AVG_PROTOTYPES(8, rvv_128)
> +AVG_PROTOTYPES(8, rvv_256)
> +
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> +{
> +#if HAVE_RVV
> + const int flags = av_get_cpu_flags();
> + int vlenb = ff_get_rv_vlenb();
> +
> + if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> + vlenb >= 32) {
> + switch (bd) {
> + case 8:
> + c->inter.avg = ff_vvc_avg_8_rvv_256;
> +# if (__riscv_xlen == 64)
> + c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
> +# endif
> + break;
> + default:
> + break;
> + }
> +    } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> +               vlenb >= 16) {
> + switch (bd) {
> + case 8:
> + c->inter.avg = ff_vvc_avg_8_rvv_128;
> +# if (__riscv_xlen == 64)
> + c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
> +# endif
> + break;
> + default:
> + break;
> + }
> + }
> +#endif
> +}
> diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> index 41e830a98a..c55a37d255 100644
> --- a/libavcodec/vvc/dsp.c
> +++ b/libavcodec/vvc/dsp.c
> @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int
> bit_depth) break;
> }
>
> -#if ARCH_X86
> +#if ARCH_RISCV
> + ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> +#elif ARCH_X86
> ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
> #endif
> }
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 1f14096c41..e03236dd76 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -180,6 +180,7 @@ typedef struct VVCDSPContext {
>
> void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
>
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
> void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
>
> #endif /* AVCODEC_VVC_DSP_H */
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
* [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-07-16 14:21 ` Rémi Denis-Courmont
@ 2024-07-18 15:02 ` uk7b
2024-07-18 15:04 ` flow gg
1 sibling, 0 replies; 19+ messages in thread
From: uk7b @ 2024-07-18 15:02 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
avg_8_2x2_c : 1.2 1.2
avg_8_2x2_rvv_i32 : 0.7 0.7
avg_8_2x4_c : 2.0 2.0
avg_8_2x4_rvv_i32 : 1.2 1.0
avg_8_2x8_c : 3.7 4.0
avg_8_2x8_rvv_i32 : 1.7 1.5
avg_8_2x16_c : 7.2 7.5
avg_8_2x16_rvv_i32 : 3.0 2.7
avg_8_2x32_c : 14.5 15.2
avg_8_2x32_rvv_i32 : 5.5 5.0
avg_8_2x64_c : 53.5 42.2
avg_8_2x64_rvv_i32 : 42.0 33.2
avg_8_2x128_c : 93.5 86.0
avg_8_2x128_rvv_i32 : 79.2 74.0
avg_8_4x2_c : 1.7 2.0
avg_8_4x2_rvv_i32 : 1.0 1.0
avg_8_4x4_c : 3.5 3.5
avg_8_4x4_rvv_i32 : 1.2 1.0
avg_8_4x8_c : 6.5 7.0
avg_8_4x8_rvv_i32 : 1.7 1.7
avg_8_4x16_c : 13.5 14.0
avg_8_4x16_rvv_i32 : 3.0 2.5
avg_8_4x32_c : 26.2 27.5
avg_8_4x32_rvv_i32 : 5.7 5.0
avg_8_4x64_c : 79.0 66.5
avg_8_4x64_rvv_i32 : 41.7 34.2
avg_8_4x128_c : 154.0 128.7
avg_8_4x128_rvv_i32 : 80.5 74.5
avg_8_8x2_c : 3.2 3.2
avg_8_8x2_rvv_i32 : 1.0 0.7
avg_8_8x4_c : 6.5 6.5
avg_8_8x4_rvv_i32 : 1.2 1.0
avg_8_8x8_c : 12.5 13.2
avg_8_8x8_rvv_i32 : 2.0 1.7
avg_8_8x16_c : 25.2 26.5
avg_8_8x16_rvv_i32 : 3.2 2.7
avg_8_8x32_c : 50.0 52.7
avg_8_8x32_rvv_i32 : 6.2 4.7
avg_8_8x64_c : 130.0 112.2
avg_8_8x64_rvv_i32 : 44.2 33.5
avg_8_8x128_c : 241.5 226.7
avg_8_8x128_rvv_i32 : 78.7 74.0
avg_8_16x2_c : 6.2 6.5
avg_8_16x2_rvv_i32 : 1.2 0.7
avg_8_16x4_c : 12.2 13.0
avg_8_16x4_rvv_i32 : 1.7 1.0
avg_8_16x8_c : 24.7 25.7
avg_8_16x8_rvv_i32 : 3.0 1.7
avg_8_16x16_c : 49.0 51.5
avg_8_16x16_rvv_i32 : 5.5 3.2
avg_8_16x32_c : 97.7 102.7
avg_8_16x32_rvv_i32 : 10.5 5.5
avg_8_16x64_c : 219.5 223.5
avg_8_16x64_rvv_i32 : 56.7 34.5
avg_8_16x128_c : 409.7 426.0
avg_8_16x128_rvv_i32 : 98.7 73.5
avg_8_32x2_c : 12.5 13.0
avg_8_32x2_rvv_i32 : 1.7 1.0
avg_8_32x4_c : 24.2 25.5
avg_8_32x4_rvv_i32 : 3.0 1.5
avg_8_32x8_c : 48.5 50.7
avg_8_32x8_rvv_i32 : 5.2 2.7
avg_8_32x16_c : 96.5 101.2
avg_8_32x16_rvv_i32 : 10.2 5.0
avg_8_32x32_c : 192.7 202.5
avg_8_32x32_rvv_i32 : 19.7 9.5
avg_8_32x64_c : 433.5 415.5
avg_8_32x64_rvv_i32 : 38.7 18.2
avg_8_32x128_c : 812.0 820.7
avg_8_32x128_rvv_i32 : 145.2 73.0
avg_8_64x2_c : 24.0 25.2
avg_8_64x2_rvv_i32 : 2.7 1.5
avg_8_64x4_c : 48.0 50.5
avg_8_64x4_rvv_i32 : 5.2 2.5
avg_8_64x8_c : 117.5 100.7
avg_8_64x8_rvv_i32 : 10.0 4.7
avg_8_64x16_c : 208.5 201.0
avg_8_64x16_rvv_i32 : 19.0 9.0
avg_8_64x32_c : 382.7 402.0
avg_8_64x32_rvv_i32 : 37.5 17.5
avg_8_64x64_c : 830.0 834.2
avg_8_64x64_rvv_i32 : 75.5 34.5
avg_8_64x128_c : 2008.0 1705.2
avg_8_64x128_rvv_i32 : 205.5 149.2
avg_8_128x2_c : 48.7 51.0
avg_8_128x2_rvv_i32 : 5.2 2.7
avg_8_128x4_c : 96.5 101.2
avg_8_128x4_rvv_i32 : 10.2 4.7
avg_8_128x8_c : 192.2 202.0
avg_8_128x8_rvv_i32 : 19.7 9.5
avg_8_128x16_c : 385.5 403.2
avg_8_128x16_rvv_i32 : 38.7 18.2
avg_8_128x32_c : 788.0 805.7
avg_8_128x32_rvv_i32 : 77.0 36.2
avg_8_128x64_c : 1597.5 1658.0
avg_8_128x64_rvv_i32 : 175.5 78.7
avg_8_128x128_c : 3156.0 3282.5
avg_8_128x128_rvv_i32 : 369.2 276.7
w_avg_8_2x2_c : 1.5 1.5
w_avg_8_2x2_rvv_i32 : 1.2 1.0
w_avg_8_2x4_c : 2.7 2.5
w_avg_8_2x4_rvv_i32 : 1.7 1.7
w_avg_8_2x8_c : 5.0 4.7
w_avg_8_2x8_rvv_i32 : 2.7 2.5
w_avg_8_2x16_c : 9.7 9.5
w_avg_8_2x16_rvv_i32 : 4.7 4.5
w_avg_8_2x32_c : 18.7 18.5
w_avg_8_2x32_rvv_i32 : 9.0 7.7
w_avg_8_2x64_c : 64.0 51.2
w_avg_8_2x64_rvv_i32 : 50.0 38.2
w_avg_8_2x128_c : 107.7 94.0
w_avg_8_2x128_rvv_i32 : 86.2 75.7
w_avg_8_4x2_c : 2.5 2.5
w_avg_8_4x2_rvv_i32 : 1.2 1.0
w_avg_8_4x4_c : 4.7 4.5
w_avg_8_4x4_rvv_i32 : 1.7 1.5
w_avg_8_4x8_c : 9.0 9.0
w_avg_8_4x8_rvv_i32 : 2.7 2.5
w_avg_8_4x16_c : 17.7 17.5
w_avg_8_4x16_rvv_i32 : 5.0 4.2
w_avg_8_4x32_c : 34.7 35.0
w_avg_8_4x32_rvv_i32 : 9.0 8.0
w_avg_8_4x64_c : 103.2 82.0
w_avg_8_4x64_rvv_i32 : 45.7 37.5
w_avg_8_4x128_c : 210.0 164.5
w_avg_8_4x128_rvv_i32 : 86.2 75.7
w_avg_8_8x2_c : 4.5 4.5
w_avg_8_8x2_rvv_i32 : 1.2 1.2
w_avg_8_8x4_c : 8.7 8.5
w_avg_8_8x4_rvv_i32 : 1.7 1.5
w_avg_8_8x8_c : 17.2 17.2
w_avg_8_8x8_rvv_i32 : 3.2 2.5
w_avg_8_8x16_c : 34.0 34.0
w_avg_8_8x16_rvv_i32 : 5.5 4.2
w_avg_8_8x32_c : 67.7 67.7
w_avg_8_8x32_rvv_i32 : 10.7 8.0
w_avg_8_8x64_c : 174.0 145.5
w_avg_8_8x64_rvv_i32 : 50.0 40.0
w_avg_8_8x128_c : 342.2 294.2
w_avg_8_8x128_rvv_i32 : 85.2 75.2
w_avg_8_16x2_c : 8.5 8.5
w_avg_8_16x2_rvv_i32 : 2.0 1.0
w_avg_8_16x4_c : 16.7 17.0
w_avg_8_16x4_rvv_i32 : 3.2 1.7
w_avg_8_16x8_c : 33.2 33.2
w_avg_8_16x8_rvv_i32 : 5.5 3.0
w_avg_8_16x16_c : 66.5 66.7
w_avg_8_16x16_rvv_i32 : 28.2 5.0
w_avg_8_16x32_c : 134.0 133.5
w_avg_8_16x32_rvv_i32 : 20.0 9.5
w_avg_8_16x64_c : 318.2 344.5
w_avg_8_16x64_rvv_i32 : 71.7 41.7
w_avg_8_16x128_c : 718.0 583.0
w_avg_8_16x128_rvv_i32 : 117.5 78.2
w_avg_8_32x2_c : 16.7 16.7
w_avg_8_32x2_rvv_i32 : 3.7 3.2
w_avg_8_32x4_c : 33.2 33.5
w_avg_8_32x4_rvv_i32 : 6.7 6.0
w_avg_8_32x8_c : 65.7 66.0
w_avg_8_32x8_rvv_i32 : 12.5 11.0
w_avg_8_32x16_c : 132.7 133.5
w_avg_8_32x16_rvv_i32 : 24.0 21.5
w_avg_8_32x32_c : 311.5 263.5
w_avg_8_32x32_rvv_i32 : 47.7 42.5
w_avg_8_32x64_c : 592.0 555.5
w_avg_8_32x64_rvv_i32 : 126.5 97.7
w_avg_8_32x128_c : 1179.0 1139.5
w_avg_8_32x128_rvv_i32 : 238.2 180.7
w_avg_8_64x2_c : 32.7 33.0
w_avg_8_64x2_rvv_i32 : 6.0 3.2
w_avg_8_64x4_c : 65.7 66.0
w_avg_8_64x4_rvv_i32 : 11.5 5.7
w_avg_8_64x8_c : 134.0 132.2
w_avg_8_64x8_rvv_i32 : 22.7 11.0
w_avg_8_64x16_c : 281.2 262.5
w_avg_8_64x16_rvv_i32 : 44.2 21.5
w_avg_8_64x32_c : 646.2 570.0
w_avg_8_64x32_rvv_i32 : 88.0 42.5
w_avg_8_64x64_c : 1203.0 1066.7
w_avg_8_64x64_rvv_i32 : 210.7 90.5
w_avg_8_64x128_c : 2688.0 2156.2
w_avg_8_64x128_rvv_i32 : 443.0 214.7
w_avg_8_128x2_c : 65.7 66.0
w_avg_8_128x2_rvv_i32 : 11.2 5.5
w_avg_8_128x4_c : 131.0 133.0
w_avg_8_128x4_rvv_i32 : 22.0 10.2
w_avg_8_128x8_c : 263.5 273.0
w_avg_8_128x8_rvv_i32 : 43.2 20.0
w_avg_8_128x16_c : 525.7 528.0
w_avg_8_128x16_rvv_i32 : 85.5 39.2
w_avg_8_128x32_c : 1064.5 1211.0
w_avg_8_128x32_rvv_i32 : 170.7 78.5
w_avg_8_128x64_c : 2305.5 2350.7
w_avg_8_128x64_rvv_i32 : 400.0 177.5
w_avg_8_128x128_c : 4771.7 4992.7
w_avg_8_128x128_rvv_i32 : 757.5 371.5
---
libavcodec/riscv/vvc/Makefile | 2 +
libavcodec/riscv/vvc/vvc_mc_rvv.S | 288 +++++++++++++++++++++++++++++
libavcodec/riscv/vvc/vvcdsp_init.c | 72 ++++++++
libavcodec/vvc/dsp.c | 4 +-
libavcodec/vvc/dsp.h | 1 +
5 files changed, 366 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/riscv/vvc/Makefile
create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S
create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c
diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
new file mode 100644
index 0000000000..582b051579
--- /dev/null
+++ b/libavcodec/riscv/vvc/Makefile
@@ -0,0 +1,2 @@
+OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
new file mode 100644
index 0000000000..79ad96a2b7
--- /dev/null
+++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vsetvlstatic8 w, vlen, max_lmul=m4
+ .if \w == 2 && \vlen == 128
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e8, m1, ta, ma
+ .elseif \w == 32 && \vlen == 128
+ li t0, \w
+ vsetvli zero, t0, e8, m2, ta, ma
+ .elseif \w <= 4 && \vlen == 256
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e8, m1, ta, ma
+ .elseif \w == 64 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e8, m2, ta, ma
+ // (\w <= 128 && \vlen == 128) || (\w == 128 && \vlen == 256)
+ .else
+ li t0, \w
+ vsetvli zero, t0, e8, \max_lmul, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic16 w, vlen, max_lmul=m8
+ .if \w == 2 && \vlen == 128
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e16, m2, ta, ma
+ .elseif \w == 32 && \vlen == 128
+ li t0, \w
+ vsetvli zero, t0, e16, m4, ta, ma
+ .elseif \w <= 4 && \vlen == 256
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e16, m2, ta, ma
+ .elseif \w == 64 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e16, m4, ta, ma
+ // (\w <= 128 && \vlen == 128) || (\w == 128 && \vlen == 256)
+ .else
+ li t0, \w
+ vsetvli zero, t0, e16, \max_lmul, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic32 w, vlen
+ .if \w == 2
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e32, m4, ta, ma
+ .elseif \w == 4 && \vlen == 256
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e32, m4, ta, ma
+ // (\w <= 128 && \vlen == 128) || (\w <= 128 && \vlen == 256)
+ .else
+ li t0, \w
+ vsetvli zero, t0, e32, m8, ta, ma
+ .endif
+.endm
+
+.macro avg w, vlen, id
+\id\w\vlen:
+.if \w < 128
+ vsetvlstatic16 \w, \vlen
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ add t2, a0, a1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ addi a5, a5, -2
+ vle16.v v16, (t0)
+ vle16.v v24, (t1)
+ vadd.vv v8, v8, v0
+ vadd.vv v24, v24, v16
+ vmax.vx v8, v8, zero
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \w, \vlen
+ addi a2, a2, 128*4
+ vnclipu.wi v8, v8, 7
+ vnclipu.wi v24, v24, 7
+ addi a3, a3, 128*4
+ vse8.v v8, (a0)
+ vse8.v v24, (t2)
+ sh1add a0, a1, a0
+.else
+ addi a5, a5, -1
+ mv t1, a0
+ mv t2, a2
+ mv t3, a3
+ mv t4, a4
+1:
+ vsetvli t0, a4, e16, m8, ta, ma
+ sub a4, a4, t0
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vadd.vv v8, v8, v0
+ vmax.vx v8, v8, zero
+ vsetvli zero, zero, e8, m4, ta, ma
+ vnclipu.wi v8, v8, 7
+ vse8.v v8, (a0)
+ sh1add a2, t0, a2
+ sh1add a3, t0, a3
+ add a0, a0, t0
+ bnez a4, 1b
+ add a0, t1, a1
+ addi a2, t2, 128*2
+ addi a3, t3, 128*2
+ mv a4, t4
+.endif
+ bnez a5, \id\w\vlen\()b
+ ret
+.endm
+
+
+.macro AVG_JMP_TABLE id, vlen
+const jmp_table_\id\vlen
+ .4byte \id\()2\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()4\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()8\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()16\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()32\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
+endconst
+.endm
+
+.macro AVG_J vlen, id
+ clz t1, a4
+ neg t1, t1
+ lla t5, jmp_table_\id\vlen
+ sh2add t1, t1, t5
+ lw t1, ((__riscv_xlen-2)<<2)(t1)
+ add t1, t1, t5
+ jr t1
+.endm
+
+.macro func_avg vlen
+func ff_vvc_avg_8_rvv_\vlen\(), zve32x
+ AVG_JMP_TABLE 1, \vlen
+ csrwi vxrm, 0
+ AVG_J \vlen, 1
+ .irp w,2,4,8,16,32,64,128
+ avg \w, \vlen, 1
+ .endr
+endfunc
+.endm
+
+func_avg 128
+func_avg 256
+
+#if (__riscv_xlen == 64)
+.macro w_avg w, vlen, id
+\id\w\vlen:
+.if \w < 32
+ vsetvlstatic16 \w, \vlen, m4
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ addi a5, a5, -2
+ vle16.v v20, (t0)
+ vle16.v v24, (t1)
+ vwmul.vx v16, v0, a7
+ vwmul.vx v28, v20, a7
+ vwmacc.vx v16, t3, v8
+ vwmacc.vx v28, t3, v24
+ vsetvlstatic32 \w, \vlen
+ add t2, a0, a1
+ vadd.vx v16, v16, t4
+ vadd.vx v28, v28, t4
+ vsetvlstatic16 \w, \vlen, m4
+ vnsrl.wx v16, v16, t6
+ vnsrl.wx v28, v28, t6
+ vmax.vx v16, v16, zero
+ vmax.vx v28, v28, zero
+ vsetvlstatic8 \w, \vlen, m2
+ addi a2, a2, 128*4
+ vnclipu.wi v16, v16, 0
+ vnclipu.wi v28, v28, 0
+ vse8.v v16, (a0)
+ addi a3, a3, 128*4
+ vse8.v v28, (t2)
+ sh1add a0, a1, a0
+.else
+ addi a5, a5, -1
+ mv t1, a0
+ mv t2, a2
+ mv t5, a3
+ mv a6, a4
+1:
+ vsetvli t0, a4, e16, m4, ta, ma
+ sub a4, a4, t0
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vwmul.vx v16, v0, a7
+ vwmacc.vx v16, t3, v8
+ vsetvli zero, zero, e32, m8, ta, ma
+ vadd.vx v16, v16, t4
+ vsetvli zero, zero, e16, m4, ta, ma
+ vnsrl.wx v16, v16, t6
+ vmax.vx v16, v16, zero
+ vsetvli zero, zero, e8, m2, ta, ma
+ vnclipu.wi v16, v16, 0
+ vse8.v v16, (a0)
+ sh1add a2, t0, a2
+ sh1add a3, t0, a3
+ add a0, a0, t0
+ bnez a4, 1b
+ add a0, t1, a1
+ addi a2, t2, 128*2
+ addi a3, t5, 128*2
+ mv a4, a6
+.endif
+ bnez a5, \id\w\vlen\()b
+ ret
+.endm
+
+.macro func_w_avg vlen
+func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x
+ AVG_JMP_TABLE 2, \vlen
+ csrwi vxrm, 0
+ addi t6, a6, 7
+ ld t3, (sp)
+ ld t4, 8(sp)
+ ld t5, 16(sp)
+ addi t4, t4, 1 // o0 + o1 + 1
+ add t4, t4, t5
+ addi t5, t6, -1 // shift - 1
+ sll t4, t4, t5
+ AVG_J \vlen, 2
+ .irp w,2,4,8,16,32,64,128
+ w_avg \w, \vlen, 2
+ .endr
+endfunc
+.endm
+
+func_w_avg 128
+func_w_avg 256
+#endif
diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
new file mode 100644
index 0000000000..9819a7c570
--- /dev/null
+++ b/libavcodec/riscv/vvc/vvcdsp_init.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vvc/dsp.h"
+
+#define bf(fn, bd, opt) fn##_##bd##_##opt
+
+#define AVG_PROTOTYPES(bd, opt) \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height); \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height, \
+ int denom, int w0, int w1, int o0, int o1);
+
+AVG_PROTOTYPES(8, rvv_128)
+AVG_PROTOTYPES(8, rvv_256)
+
+void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
+{
+#if HAVE_RVV
+ const int flags = av_get_cpu_flags();
+ int vlenb = ff_get_rv_vlenb();
+
+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ vlenb >= 32) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_256;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
+# endif
+ break;
+ default:
+ break;
+ }
+ } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ vlenb >= 16) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_128;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
+# endif
+ break;
+ default:
+ break;
+ }
+ }
+#endif
+}
diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
index 41e830a98a..c55a37d255 100644
--- a/libavcodec/vvc/dsp.c
+++ b/libavcodec/vvc/dsp.c
@@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
break;
}
-#if ARCH_X86
+#if ARCH_RISCV
+ ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
+#elif ARCH_X86
ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
#endif
}
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 1f14096c41..e03236dd76 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -180,6 +180,7 @@ typedef struct VVCDSPContext {
void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
+void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
#endif /* AVCODEC_VVC_DSP_H */
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-07-16 14:21 ` Rémi Denis-Courmont
2024-07-18 15:02 ` uk7b
@ 2024-07-18 15:04 ` flow gg
2024-07-19 15:55 ` Rémi Denis-Courmont
1 sibling, 1 reply; 19+ messages in thread
From: flow gg @ 2024-07-18 15:04 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> Again, I don't think that a maximum multiplier belongs here. If the calling
> code cannot scale the multiplier up, then it should be a normal loop providing
> the same code for all VLENs.
I think it's acceptable to add such a parameter, even though it isn't
particularly common in other files, because these vset helpers are local to
vvc_mc_rvv.S rather than libavutil/riscv/asm.S. The parameter isn't only used
for avg and w_avg; it can also save some .if branches for other functions in
vvc_mc_rvv.S later on.
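Concretely, the cap matters in w_avg because the e16 inputs get widened; the
relevant lines of the patch are the ones below, and the m4 limit is presumably
there so that the e32 results of vwmul/vwmacc still fit in an m8 register group:

        vsetvlstatic16 \w, \vlen, m4    // e16 operands capped at LMUL = m4
        vwmul.vx       v16, v0, a7      // widens to e32, i.e. an m8 group
        vwmacc.vx      v16, t3, v8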
>> + .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
>> + .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
> Maybe use .irp here?
I'm not sure; mixing .irp with the macro arguments gives a syntax error here:
.irp w,2,4,8,16,32,64,128
.4byte \id\()\w\()\vlen\()f - jmp_table_\id\vlen
.endr
libavcodec/riscv/vvc/vvc_mc_rvv.S:176: Error: junk at end of line, first
unrecognized character is `\'
libavcodec/riscv/vvc/vvc_mc_rvv.S:195: Info: macro invoked from here
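One possible workaround (untested) is to route the entry through a small helper
macro so that all three names are real macro parameters; the JMP_TABLE_ENTRY
name below is just a placeholder:

.macro JMP_TABLE_ENTRY id, w, vlen
        .4byte  \id\()\w\()\vlen\()f - jmp_table_\id\vlen
.endm

.macro AVG_JMP_TABLE id, vlen
const jmp_table_\id\vlen
        .irp    w,2,4,8,16,32,64,128
        JMP_TABLE_ENTRY \id, \w, \vlen
        .endr
endconst
.endm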
> Breaks build if XLEN = 32.
Okay, updated it.
On Tue, 16 Jul 2024 at 22:31, Rémi Denis-Courmont <remi@remlab.net> wrote:
> On Wednesday, 10 July 2024 at 13.02.44 EEST, uk7b@foxmail.com wrote:
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> > C908 X60
> > avg_8_2x2_c : 1.2 1.2
> > avg_8_2x2_rvv_i32 : 0.7 0.7
> > avg_8_2x4_c : 2.0 2.0
> > avg_8_2x4_rvv_i32 : 1.2 1.0
> > avg_8_2x8_c : 3.7 4.0
> > avg_8_2x8_rvv_i32 : 1.7 1.5
> > avg_8_2x16_c : 7.2 7.5
> > avg_8_2x16_rvv_i32 : 3.0 2.7
> > avg_8_2x32_c : 14.5 15.2
> > avg_8_2x32_rvv_i32 : 5.5 5.0
> > avg_8_2x64_c : 53.5 42.2
> > avg_8_2x64_rvv_i32 : 42.0 33.2
> > avg_8_2x128_c : 93.5 86.0
> > avg_8_2x128_rvv_i32 : 79.2 74.0
> > avg_8_4x2_c : 1.7 2.0
> > avg_8_4x2_rvv_i32 : 1.0 1.0
> > avg_8_4x4_c : 3.5 3.5
> > avg_8_4x4_rvv_i32 : 1.2 1.0
> > avg_8_4x8_c : 6.5 7.0
> > avg_8_4x8_rvv_i32 : 1.7 1.7
> > avg_8_4x16_c : 13.5 14.0
> > avg_8_4x16_rvv_i32 : 3.0 2.5
> > avg_8_4x32_c : 26.2 27.5
> > avg_8_4x32_rvv_i32 : 5.7 5.0
> > avg_8_4x64_c : 79.0 66.5
> > avg_8_4x64_rvv_i32 : 41.7 34.2
> > avg_8_4x128_c : 154.0 128.7
> > avg_8_4x128_rvv_i32 : 80.5 74.5
> > avg_8_8x2_c : 3.2 3.2
> > avg_8_8x2_rvv_i32 : 1.0 0.7
> > avg_8_8x4_c : 6.5 6.5
> > avg_8_8x4_rvv_i32 : 1.2 1.0
> > avg_8_8x8_c : 12.5 13.2
> > avg_8_8x8_rvv_i32 : 2.0 1.7
> > avg_8_8x16_c : 25.2 26.5
> > avg_8_8x16_rvv_i32 : 3.2 2.7
> > avg_8_8x32_c : 50.0 52.7
> > avg_8_8x32_rvv_i32 : 6.2 4.7
> > avg_8_8x64_c : 130.0 112.2
> > avg_8_8x64_rvv_i32 : 44.2 33.5
> > avg_8_8x128_c : 241.5 226.7
> > avg_8_8x128_rvv_i32 : 78.7 74.0
> > avg_8_16x2_c : 6.2 6.5
> > avg_8_16x2_rvv_i32 : 1.2 0.7
> > avg_8_16x4_c : 12.2 13.0
> > avg_8_16x4_rvv_i32 : 1.7 1.0
> > avg_8_16x8_c : 24.7 25.7
> > avg_8_16x8_rvv_i32 : 3.0 1.7
> > avg_8_16x16_c : 49.0 51.5
> > avg_8_16x16_rvv_i32 : 5.5 3.2
> > avg_8_16x32_c : 97.7 102.7
> > avg_8_16x32_rvv_i32 : 10.5 5.5
> > avg_8_16x64_c : 219.5 223.5
> > avg_8_16x64_rvv_i32 : 56.7 34.5
> > avg_8_16x128_c : 409.7 426.0
> > avg_8_16x128_rvv_i32 : 98.7 73.5
> > avg_8_32x2_c : 12.5 13.0
> > avg_8_32x2_rvv_i32 : 1.7 1.0
> > avg_8_32x4_c : 24.2 25.5
> > avg_8_32x4_rvv_i32 : 3.0 1.5
> > avg_8_32x8_c : 48.5 50.7
> > avg_8_32x8_rvv_i32 : 5.2 2.7
> > avg_8_32x16_c : 96.5 101.2
> > avg_8_32x16_rvv_i32 : 10.2 5.0
> > avg_8_32x32_c : 192.7 202.5
> > avg_8_32x32_rvv_i32 : 19.7 9.5
> > avg_8_32x64_c : 433.5 415.5
> > avg_8_32x64_rvv_i32 : 38.7 18.2
> > avg_8_32x128_c : 812.0 820.7
> > avg_8_32x128_rvv_i32 : 145.2 73.0
> > avg_8_64x2_c : 24.0 25.2
> > avg_8_64x2_rvv_i32 : 2.7 1.5
> > avg_8_64x4_c : 48.0 50.5
> > avg_8_64x4_rvv_i32 : 5.2 2.5
> > avg_8_64x8_c : 117.5 100.7
> > avg_8_64x8_rvv_i32 : 10.0 4.7
> > avg_8_64x16_c : 208.5 201.0
> > avg_8_64x16_rvv_i32 : 19.0 9.0
> > avg_8_64x32_c : 382.7 402.0
> > avg_8_64x32_rvv_i32 : 37.5 17.5
> > avg_8_64x64_c : 830.0 834.2
> > avg_8_64x64_rvv_i32 : 75.5 34.5
> > avg_8_64x128_c : 2008.0 1705.2
> > avg_8_64x128_rvv_i32 : 205.5 149.2
> > avg_8_128x2_c : 48.7 51.0
> > avg_8_128x2_rvv_i32 : 5.2 2.7
> > avg_8_128x4_c : 96.5 101.2
> > avg_8_128x4_rvv_i32 : 10.2 4.7
> > avg_8_128x8_c : 192.2 202.0
> > avg_8_128x8_rvv_i32 : 19.7 9.5
> > avg_8_128x16_c : 385.5 403.2
> > avg_8_128x16_rvv_i32 : 38.7 18.2
> > avg_8_128x32_c : 788.0 805.7
> > avg_8_128x32_rvv_i32 : 77.0 36.2
> > avg_8_128x64_c : 1597.5 1658.0
> > avg_8_128x64_rvv_i32 : 175.5 78.7
> > avg_8_128x128_c : 3156.0 3282.5
> > avg_8_128x128_rvv_i32 : 369.2 276.7
> > w_avg_8_2x2_c : 1.5 1.5
> > w_avg_8_2x2_rvv_i32 : 1.2 1.0
> > w_avg_8_2x4_c : 2.7 2.5
> > w_avg_8_2x4_rvv_i32 : 1.7 1.7
> > w_avg_8_2x8_c : 5.0 4.7
> > w_avg_8_2x8_rvv_i32 : 2.7 2.5
> > w_avg_8_2x16_c : 9.7 9.5
> > w_avg_8_2x16_rvv_i32 : 4.7 4.5
> > w_avg_8_2x32_c : 18.7 18.5
> > w_avg_8_2x32_rvv_i32 : 9.0 7.7
> > w_avg_8_2x64_c : 64.0 51.2
> > w_avg_8_2x64_rvv_i32 : 50.0 38.2
> > w_avg_8_2x128_c : 107.7 94.0
> > w_avg_8_2x128_rvv_i32 : 86.2 75.7
> > w_avg_8_4x2_c : 2.5 2.5
> > w_avg_8_4x2_rvv_i32 : 1.2 1.0
> > w_avg_8_4x4_c : 4.7 4.5
> > w_avg_8_4x4_rvv_i32 : 1.7 1.5
> > w_avg_8_4x8_c : 9.0 9.0
> > w_avg_8_4x8_rvv_i32 : 2.7 2.5
> > w_avg_8_4x16_c : 17.7 17.5
> > w_avg_8_4x16_rvv_i32 : 5.0 4.2
> > w_avg_8_4x32_c : 34.7 35.0
> > w_avg_8_4x32_rvv_i32 : 9.0 8.0
> > w_avg_8_4x64_c : 103.2 82.0
> > w_avg_8_4x64_rvv_i32 : 45.7 37.5
> > w_avg_8_4x128_c : 210.0 164.5
> > w_avg_8_4x128_rvv_i32 : 86.2 75.7
> > w_avg_8_8x2_c : 4.5 4.5
> > w_avg_8_8x2_rvv_i32 : 1.2 1.2
> > w_avg_8_8x4_c : 8.7 8.5
> > w_avg_8_8x4_rvv_i32 : 1.7 1.5
> > w_avg_8_8x8_c : 17.2 17.2
> > w_avg_8_8x8_rvv_i32 : 3.2 2.5
> > w_avg_8_8x16_c : 34.0 34.0
> > w_avg_8_8x16_rvv_i32 : 5.5 4.2
> > w_avg_8_8x32_c : 67.7 67.7
> > w_avg_8_8x32_rvv_i32 : 10.7 8.0
> > w_avg_8_8x64_c : 174.0 145.5
> > w_avg_8_8x64_rvv_i32 : 50.0 40.0
> > w_avg_8_8x128_c : 342.2 294.2
> > w_avg_8_8x128_rvv_i32 : 85.2 75.2
> > w_avg_8_16x2_c : 8.5 8.5
> > w_avg_8_16x2_rvv_i32 : 2.0 1.0
> > w_avg_8_16x4_c : 16.7 17.0
> > w_avg_8_16x4_rvv_i32 : 3.2 1.7
> > w_avg_8_16x8_c : 33.2 33.2
> > w_avg_8_16x8_rvv_i32 : 5.5 3.0
> > w_avg_8_16x16_c : 66.5 66.7
> > w_avg_8_16x16_rvv_i32 : 28.2 5.0
> > w_avg_8_16x32_c : 134.0 133.5
> > w_avg_8_16x32_rvv_i32 : 20.0 9.5
> > w_avg_8_16x64_c : 318.2 344.5
> > w_avg_8_16x64_rvv_i32 : 71.7 41.7
> > w_avg_8_16x128_c : 718.0 583.0
> > w_avg_8_16x128_rvv_i32 : 117.5 78.2
> > w_avg_8_32x2_c : 16.7 16.7
> > w_avg_8_32x2_rvv_i32 : 3.7 3.2
> > w_avg_8_32x4_c : 33.2 33.5
> > w_avg_8_32x4_rvv_i32 : 6.7 6.0
> > w_avg_8_32x8_c : 65.7 66.0
> > w_avg_8_32x8_rvv_i32 : 12.5 11.0
> > w_avg_8_32x16_c : 132.7 133.5
> > w_avg_8_32x16_rvv_i32 : 24.0 21.5
> > w_avg_8_32x32_c : 311.5 263.5
> > w_avg_8_32x32_rvv_i32 : 47.7 42.5
> > w_avg_8_32x64_c : 592.0 555.5
> > w_avg_8_32x64_rvv_i32 : 126.5 97.7
> > w_avg_8_32x128_c : 1179.0 1139.5
> > w_avg_8_32x128_rvv_i32 : 238.2 180.7
> > w_avg_8_64x2_c : 32.7 33.0
> > w_avg_8_64x2_rvv_i32 : 6.0 3.2
> > w_avg_8_64x4_c : 65.7 66.0
> > w_avg_8_64x4_rvv_i32 : 11.5 5.7
> > w_avg_8_64x8_c : 134.0 132.2
> > w_avg_8_64x8_rvv_i32 : 22.7 11.0
> > w_avg_8_64x16_c : 281.2 262.5
> > w_avg_8_64x16_rvv_i32 : 44.2 21.5
> > w_avg_8_64x32_c : 646.2 570.0
> > w_avg_8_64x32_rvv_i32 : 88.0 42.5
> > w_avg_8_64x64_c : 1203.0 1066.7
> > w_avg_8_64x64_rvv_i32 : 210.7 90.5
> > w_avg_8_64x128_c : 2688.0 2156.2
> > w_avg_8_64x128_rvv_i32 : 443.0 214.7
> > w_avg_8_128x2_c : 65.7 66.0
> > w_avg_8_128x2_rvv_i32 : 11.2 5.5
> > w_avg_8_128x4_c : 131.0 133.0
> > w_avg_8_128x4_rvv_i32 : 22.0 10.2
> > w_avg_8_128x8_c : 263.5 273.0
> > w_avg_8_128x8_rvv_i32 : 43.2 20.0
> > w_avg_8_128x16_c : 525.7 528.0
> > w_avg_8_128x16_rvv_i32 : 85.5 39.2
> > w_avg_8_128x32_c : 1064.5 1211.0
> > w_avg_8_128x32_rvv_i32 : 170.7 78.5
> > w_avg_8_128x64_c : 2305.5 2350.7
> > w_avg_8_128x64_rvv_i32 : 400.0 177.5
> > w_avg_8_128x128_c : 4771.7 4992.7
> > w_avg_8_128x128_rvv_i32 : 757.5 371.5
> > ---
> > libavcodec/riscv/vvc/Makefile | 2 +
> > libavcodec/riscv/vvc/vvc_mc_rvv.S | 288 +++++++++++++++++++++++++++++
> > libavcodec/riscv/vvc/vvcdsp_init.c | 72 ++++++++
> > libavcodec/vvc/dsp.c | 4 +-
> > libavcodec/vvc/dsp.h | 1 +
> > 5 files changed, 366 insertions(+), 1 deletion(-)
> > create mode 100644 libavcodec/riscv/vvc/Makefile
> > create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S
> > create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c
> >
> > diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
> > new file mode 100644
> > index 0000000000..582b051579
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvc/Makefile
> > @@ -0,0 +1,2 @@
> > +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
> > +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
> > diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> > new file mode 100644
> > index 0000000000..8cf4bcf680
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> > @@ -0,0 +1,288 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +.macro vsetvlstatic8 w, vlen, max_lmul=m4
>
> Again, I don't think that a maximum multiplier belongs here. If the calling
> code cannot scale the multiplier up, then it should be a normal loop providing
> the same code for all VLENs.
>
> > + .if \w == 2 && \vlen == 128
> > + vsetivli zero, \w, e8, mf8, ta, ma
> > + .elseif \w == 4 && \vlen == 128
> > + vsetivli zero, \w, e8, mf4, ta, ma
> > + .elseif \w == 8 && \vlen == 128
> > + vsetivli zero, \w, e8, mf2, ta, ma
> > + .elseif \w == 16 && \vlen == 128
> > + vsetivli zero, \w, e8, m1, ta, ma
> > + .elseif \w == 32 && \vlen == 128
> > + li t0, \w
> > + vsetvli zero, t0, e8, m2, ta, ma
> > + .elseif \w <= 4 && \vlen == 256
> > + vsetivli zero, \w, e8, mf8, ta, ma
> > + .elseif \w == 8 && \vlen == 256
> > + vsetivli zero, \w, e8, mf4, ta, ma
> > + .elseif \w == 16 && \vlen == 256
> > + vsetivli zero, \w, e8, mf2, ta, ma
> > + .elseif \w == 32 && \vlen == 256
> > + li t0, \w
> > + vsetvli zero, t0, e8, m1, ta, ma
> > + .elseif \w == 64 && \vlen == 256
> > + li t0, \w
> > + vsetvli zero, t0, e8, m2, ta, ma
> > + // (\w <= 128 && \vlen == 128) || (\w == 128 && \vlen == 256)
> > + .else
> > + li t0, \w
> > + vsetvli zero, t0, e8, \max_lmul, ta, ma
> > + .endif
> > +.endm
> > +
> > +.macro vsetvlstatic16 w, vlen, max_lmul=m8
> > + .if \w == 2 && \vlen == 128
> > + vsetivli zero, \w, e16, mf4, ta, ma
> > + .elseif \w == 4 && \vlen == 128
> > + vsetivli zero, \w, e16, mf2, ta, ma
> > + .elseif \w == 8 && \vlen == 128
> > + vsetivli zero, \w, e16, m1, ta, ma
> > + .elseif \w == 16 && \vlen == 128
> > + vsetivli zero, \w, e16, m2, ta, ma
> > + .elseif \w == 32 && \vlen == 128
> > + li t0, \w
> > + vsetvli zero, t0, e16, m4, ta, ma
> > + .elseif \w <= 4 && \vlen == 256
> > + vsetivli zero, \w, e16, mf4, ta, ma
> > + .elseif \w == 8 && \vlen == 256
> > + vsetivli zero, \w, e16, mf2, ta, ma
> > + .elseif \w == 16 && \vlen == 256
> > + vsetivli zero, \w, e16, m1, ta, ma
> > + .elseif \w == 32 && \vlen == 256
> > + li t0, \w
> > + vsetvli zero, t0, e16, m2, ta, ma
> > + .elseif \w == 64 && \vlen == 256
> > + li t0, \w
> > + vsetvli zero, t0, e16, m4, ta, ma
> > + // (\w <= 128 && \vlen == 128) || (\w == 128 && \vlen == 256)
> > + .else
> > + li t0, \w
> > + vsetvli zero, t0, e16, \max_lmul, ta, ma
> > + .endif
> > +.endm
> > +
> > +.macro vsetvlstatic32 w, vlen
> > + .if \w == 2
> > + vsetivli zero, \w, e32, mf2, ta, ma
> > + .elseif \w == 4 && \vlen == 128
> > + vsetivli zero, \w, e32, m1, ta, ma
> > + .elseif \w == 8 && \vlen == 128
> > + vsetivli zero, \w, e32, m2, ta, ma
> > + .elseif \w == 16 && \vlen == 128
> > + vsetivli zero, \w, e32, m4, ta, ma
> > + .elseif \w == 4 && \vlen == 256
> > + vsetivli zero, \w, e32, mf2, ta, ma
> > + .elseif \w == 8 && \vlen == 256
> > + vsetivli zero, \w, e32, m1, ta, ma
> > + .elseif \w == 16 && \vlen == 256
> > + vsetivli zero, \w, e32, m2, ta, ma
> > + .elseif \w == 32 && \vlen == 256
> > + li t0, \w
> > + vsetvli zero, t0, e32, m4, ta, ma
> > + // (\w <= 128 && \vlen == 128) || (\w <= 128 && \vlen == 256)
> > + .else
> > + li t0, \w
> > + vsetvli zero, t0, e32, m8, ta, ma
> > + .endif
> > +.endm
> > +
> > +.macro avg w, vlen, id
> > +\id\w\vlen:
> > +.if \w < 128
> > + vsetvlstatic16 \w, \vlen
> > + addi t0, a2, 128*2
> > + addi t1, a3, 128*2
> > + add t2, a0, a1
> > + vle16.v v0, (a2)
> > + vle16.v v8, (a3)
> > + addi a5, a5, -2
> > + vle16.v v16, (t0)
> > + vle16.v v24, (t1)
> > + vadd.vv v8, v8, v0
> > + vadd.vv v24, v24, v16
> > + vmax.vx v8, v8, zero
> > + vmax.vx v24, v24, zero
> > + vsetvlstatic8 \w, \vlen
> > + addi a2, a2, 128*4
> > + vnclipu.wi v8, v8, 7
> > + vnclipu.wi v24, v24, 7
> > + addi a3, a3, 128*4
> > + vse8.v v8, (a0)
> > + vse8.v v24, (t2)
> > + sh1add a0, a1, a0
> > +.else
> > + addi a5, a5, -1
> > + mv t1, a0
> > + mv t2, a2
> > + mv t3, a3
> > + mv t4, a4
> > +1:
> > + vsetvli t0, a4, e16, m8, ta, ma
> > + sub a4, a4, t0
> > + vle16.v v0, (a2)
> > + vle16.v v8, (a3)
> > + vadd.vv v8, v8, v0
> > + vmax.vx v8, v8, zero
> > + vsetvli zero, zero, e8, m4, ta, ma
> > + vnclipu.wi v8, v8, 7
> > + vse8.v v8, (a0)
> > + sh1add a2, t0, a2
> > + sh1add a3, t0, a3
> > + add a0, a0, t0
> > + bnez a4, 1b
> > + add a0, t1, a1
> > + addi a2, t2, 128*2
> > + addi a3, t3, 128*2
> > + mv a4, t4
> > +.endif
> > + bnez a5, \id\w\vlen\()b
> > + ret
> > +.endm
> > +
> > +
> > +.macro AVG_JMP_TABLE id, vlen
> > +const jmp_table_\id\vlen
> > + .4byte \id\()2\vlen\()f - jmp_table_\id\vlen
> > + .4byte \id\()4\vlen\()f - jmp_table_\id\vlen
> > + .4byte \id\()8\vlen\()f - jmp_table_\id\vlen
> > + .4byte \id\()16\vlen\()f - jmp_table_\id\vlen
> > + .4byte \id\()32\vlen\()f - jmp_table_\id\vlen
> > + .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
> > + .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
>
> Maybe use .irp here?
>
> > +endconst
> > +.endm
> > +
> > +.macro AVG_J vlen, id
> > + clz t1, a4
> > + neg t1, t1
> > + lla t5, jmp_table_\id\vlen
> > + sh2add t1, t1, t5
> > + lw t1, ((__riscv_xlen-2)<<2)(t1)
> > + add t1, t1, t5
> > + jr t1
> > +.endm
> > +
> > +.macro func_avg vlen
> > +func ff_vvc_avg_8_rvv_\vlen\(), zve32x
> > + AVG_JMP_TABLE 1, \vlen
> > + csrwi vxrm, 0
> > + AVG_J \vlen, 1
> > + .irp w,2,4,8,16,32,64,128
> > + avg \w, \vlen, 1
> > + .endr
> > +endfunc
> > +.endm
> > +
> > +.macro w_avg w, vlen, id
> > +\id\w\vlen:
> > +.if \w < 32
> > + vsetvlstatic16 \w, \vlen, m4
> > + addi t0, a2, 128*2
> > + addi t1, a3, 128*2
> > + vle16.v v0, (a2)
> > + vle16.v v8, (a3)
> > + addi a5, a5, -2
> > + vle16.v v20, (t0)
> > + vle16.v v24, (t1)
> > + vwmul.vx v16, v0, a7
> > + vwmul.vx v28, v20, a7
> > + vwmacc.vx v16, t3, v8
> > + vwmacc.vx v28, t3, v24
> > + vsetvlstatic32 \w, \vlen
> > + add t2, a0, a1
> > + vadd.vx v16, v16, t4
> > + vadd.vx v28, v28, t4
> > + vsetvlstatic16 \w, \vlen, m4
> > + vnsrl.wx v16, v16, t6
> > + vnsrl.wx v28, v28, t6
> > + vmax.vx v16, v16, zero
> > + vmax.vx v28, v28, zero
> > + vsetvlstatic8 \w, \vlen, m2
> > + addi a2, a2, 128*4
> > + vnclipu.wi v16, v16, 0
> > + vnclipu.wi v28, v28, 0
> > + vse8.v v16, (a0)
> > + addi a3, a3, 128*4
> > + vse8.v v28, (t2)
> > + sh1add a0, a1, a0
> > +.else
> > + addi a5, a5, -1
> > + mv t1, a0
> > + mv t2, a2
> > + mv t5, a3
> > + mv a6, a4
> > +1:
> > + vsetvli t0, a4, e16, m4, ta, ma
> > + sub a4, a4, t0
> > + vle16.v v0, (a2)
> > + vle16.v v8, (a3)
> > + vwmul.vx v16, v0, a7
> > + vwmacc.vx v16, t3, v8
> > + vsetvli zero, zero, e32, m8, ta, ma
> > + vadd.vx v16, v16, t4
> > + vsetvli zero, zero, e16, m4, ta, ma
> > + vnsrl.wx v16, v16, t6
> > + vmax.vx v16, v16, zero
> > + vsetvli zero, zero, e8, m2, ta, ma
> > + vnclipu.wi v16, v16, 0
> > + vse8.v v16, (a0)
> > + sh1add a2, t0, a2
> > + sh1add a3, t0, a3
> > + add a0, a0, t0
> > + bnez a4, 1b
> > + add a0, t1, a1
> > + addi a2, t2, 128*2
> > + addi a3, t5, 128*2
> > + mv a4, a6
> > +.endif
> > + bnez a5, \id\w\vlen\()b
> > + ret
> > +.endm
> > +
> > +
> > +.macro func_w_avg vlen
> > +func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x
> > + AVG_JMP_TABLE 2, \vlen
> > + csrwi vxrm, 0
> > + addi t6, a6, 7
> > + ld t3, (sp)
> > + ld t4, 8(sp)
> > + ld t5, 16(sp)
>
> Breaks build if XLEN = 32.
>
> > + addi t4, t4, 1 // o0 + o1 + 1
> > + add t4, t4, t5
> > + addi t5, t6, -1 // shift - 1
> > + sll t4, t4, t5
> > + AVG_J \vlen, 2
> > + .irp w,2,4,8,16,32,64,128
> > + w_avg \w, \vlen, 2
> > + .endr
> > +endfunc
> > +.endm
> > +
> > +func_avg 128
> > +func_avg 256
> > +#if (__riscv_xlen == 64)
> > +func_w_avg 128
> > +func_w_avg 256
> > +#endif
> > diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
> > new file mode 100644
> > index 0000000000..9819a7c570
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvc/vvcdsp_init.c
> > @@ -0,0 +1,72 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "config.h"
> > +
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavutil/riscv/cpu.h"
> > +#include "libavcodec/vvc/dsp.h"
> > +
> > +#define bf(fn, bd, opt) fn##_##bd##_##opt
> > +
> > +#define AVG_PROTOTYPES(bd, opt)                                               \
> > +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,              \
> > +    const int16_t *src0, const int16_t *src1, int width, int height);         \
> > +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,            \
> > +    const int16_t *src0, const int16_t *src1, int width, int height,          \
> > +    int denom, int w0, int w1, int o0, int o1);
> > +
> > +AVG_PROTOTYPES(8, rvv_128)
> > +AVG_PROTOTYPES(8, rvv_256)
> > +
> > +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> > +{
> > +#if HAVE_RVV
> > + const int flags = av_get_cpu_flags();
> > + int vlenb = ff_get_rv_vlenb();
> > +
> > +    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> > +        vlenb >= 32) {
> > + switch (bd) {
> > + case 8:
> > + c->inter.avg = ff_vvc_avg_8_rvv_256;
> > +# if (__riscv_xlen == 64)
> > + c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
> > +# endif
> > + break;
> > + default:
> > + break;
> > + }
> > +    } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> > +               vlenb >= 16) {
> > + switch (bd) {
> > + case 8:
> > + c->inter.avg = ff_vvc_avg_8_rvv_128;
> > +# if (__riscv_xlen == 64)
> > + c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
> > +# endif
> > + break;
> > + default:
> > + break;
> > + }
> > + }
> > +#endif
> > +}
> > diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> > index 41e830a98a..c55a37d255 100644
> > --- a/libavcodec/vvc/dsp.c
> > +++ b/libavcodec/vvc/dsp.c
> > @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
> > break;
> > }
> >
> > -#if ARCH_X86
> > +#if ARCH_RISCV
> > + ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> > +#elif ARCH_X86
> > ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
> > #endif
> > }
> > diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> > index 1f14096c41..e03236dd76 100644
> > --- a/libavcodec/vvc/dsp.h
> > +++ b/libavcodec/vvc/dsp.h
> > @@ -180,6 +180,7 @@ typedef struct VVCDSPContext {
> >
> > void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
> >
> > +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
> > void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
> >
> > #endif /* AVCODEC_VVC_DSP_H */
>
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-07-18 15:04 ` flow gg
@ 2024-07-19 15:55 ` Rémi Denis-Courmont
2024-07-21 13:43 ` uk7b
2024-07-21 13:45 ` flow gg
0 siblings, 2 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-19 15:55 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Thursday, 18 July 2024 at 18:04:15 EEST, flow gg wrote:
> > Again, I don't think that a maximum multiplier belongs here. If the
> > calling code cannot scale the multiplier up, then it should be a normal
> > loop providing the same code for all VLENs.
>
> I think it's acceptable to add such a parameter, which isn't particularly
> common in other files, because this vset is used for vvc_mc_rvv.S rather
> than libavutil/riscv/asm.S.
Maybe but that's really not my point. If you use the same LMUL for all VLENBs,
then you should use the same function, not two copies of the exact same
function.
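
Concretely, the narrowest width already hits that case: w == 2 selects the same vtype in both expansions of the vsetvlstatic macros, e.g.

        vsetivli zero, 2, e16, mf4, ta, ma   // w == 2, vlen == 128
        vsetivli zero, 2, e16, mf4, ta, ma   // w == 2, vlen == 256 -- identical

so the w == 2 bodies of ff_vvc_avg_8_rvv_128 and ff_vvc_avg_8_rvv_256 assemble to the same code and could, in principle, be emitted once and referenced from both jump tables.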
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-07-19 15:55 ` Rémi Denis-Courmont
@ 2024-07-21 13:43 ` uk7b
2024-07-21 13:45 ` flow gg
1 sibling, 0 replies; 19+ messages in thread
From: uk7b @ 2024-07-21 13:43 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
avg_8_2x2_c : 1.2 1.0
avg_8_2x2_rvv_i32 : 0.7 0.7
avg_8_2x4_c : 2.0 2.2
avg_8_2x4_rvv_i32 : 1.2 1.2
avg_8_2x8_c : 3.7 4.0
avg_8_2x8_rvv_i32 : 1.7 1.5
avg_8_2x16_c : 7.2 7.7
avg_8_2x16_rvv_i32 : 3.0 2.7
avg_8_2x32_c : 14.2 15.2
avg_8_2x32_rvv_i32 : 5.5 5.0
avg_8_2x64_c : 51.0 43.7
avg_8_2x64_rvv_i32 : 39.2 29.7
avg_8_2x128_c : 100.5 79.2
avg_8_2x128_rvv_i32 : 79.7 68.2
avg_8_4x2_c : 1.7 2.0
avg_8_4x2_rvv_i32 : 1.0 0.7
avg_8_4x4_c : 3.5 3.7
avg_8_4x4_rvv_i32 : 1.2 1.2
avg_8_4x8_c : 6.7 7.0
avg_8_4x8_rvv_i32 : 1.7 1.5
avg_8_4x16_c : 13.5 14.0
avg_8_4x16_rvv_i32 : 3.0 2.7
avg_8_4x32_c : 26.2 27.7
avg_8_4x32_rvv_i32 : 5.5 4.7
avg_8_4x64_c : 73.0 73.7
avg_8_4x64_rvv_i32 : 39.0 32.5
avg_8_4x128_c : 143.0 137.2
avg_8_4x128_rvv_i32 : 72.7 68.0
avg_8_8x2_c : 3.5 3.5
avg_8_8x2_rvv_i32 : 1.0 0.7
avg_8_8x4_c : 6.2 6.5
avg_8_8x4_rvv_i32 : 1.5 1.0
avg_8_8x8_c : 12.7 13.2
avg_8_8x8_rvv_i32 : 2.0 1.5
avg_8_8x16_c : 25.0 26.5
avg_8_8x16_rvv_i32 : 3.2 2.7
avg_8_8x32_c : 50.0 52.7
avg_8_8x32_rvv_i32 : 6.2 5.0
avg_8_8x64_c : 118.7 122.5
avg_8_8x64_rvv_i32 : 40.2 31.5
avg_8_8x128_c : 236.7 220.2
avg_8_8x128_rvv_i32 : 85.2 67.7
avg_8_16x2_c : 6.2 6.7
avg_8_16x2_rvv_i32 : 1.2 0.7
avg_8_16x4_c : 12.5 13.0
avg_8_16x4_rvv_i32 : 1.7 1.0
avg_8_16x8_c : 24.5 26.0
avg_8_16x8_rvv_i32 : 3.0 1.7
avg_8_16x16_c : 49.0 51.5
avg_8_16x16_rvv_i32 : 5.5 3.0
avg_8_16x32_c : 97.5 102.5
avg_8_16x32_rvv_i32 : 10.5 5.5
avg_8_16x64_c : 213.7 222.0
avg_8_16x64_rvv_i32 : 48.5 34.2
avg_8_16x128_c : 434.7 420.0
avg_8_16x128_rvv_i32 : 97.7 74.0
avg_8_32x2_c : 12.2 12.7
avg_8_32x2_rvv_i32 : 1.5 1.0
avg_8_32x4_c : 24.5 25.5
avg_8_32x4_rvv_i32 : 3.0 1.7
avg_8_32x8_c : 48.5 50.7
avg_8_32x8_rvv_i32 : 5.2 2.7
avg_8_32x16_c : 96.7 101.2
avg_8_32x16_rvv_i32 : 10.2 5.0
avg_8_32x32_c : 192.7 202.2
avg_8_32x32_rvv_i32 : 19.7 9.5
avg_8_32x64_c : 427.5 426.5
avg_8_32x64_rvv_i32 : 64.2 18.2
avg_8_32x128_c : 816.5 821.0
avg_8_32x128_rvv_i32 : 135.2 75.5
avg_8_64x2_c : 24.0 25.2
avg_8_64x2_rvv_i32 : 2.7 1.5
avg_8_64x4_c : 48.2 50.5
avg_8_64x4_rvv_i32 : 5.0 2.7
avg_8_64x8_c : 96.0 100.7
avg_8_64x8_rvv_i32 : 9.7 4.5
avg_8_64x16_c : 207.7 201.2
avg_8_64x16_rvv_i32 : 19.0 9.0
avg_8_64x32_c : 383.2 402.0
avg_8_64x32_rvv_i32 : 37.5 17.5
avg_8_64x64_c : 837.2 828.7
avg_8_64x64_rvv_i32 : 84.7 35.5
avg_8_64x128_c : 1640.7 1640.2
avg_8_64x128_rvv_i32 : 206.0 153.0
avg_8_128x2_c : 48.7 51.0
avg_8_128x2_rvv_i32 : 5.2 2.7
avg_8_128x4_c : 96.7 101.5
avg_8_128x4_rvv_i32 : 10.2 5.0
avg_8_128x8_c : 192.2 202.0
avg_8_128x8_rvv_i32 : 19.7 9.2
avg_8_128x16_c : 400.7 403.2
avg_8_128x16_rvv_i32 : 38.7 18.5
avg_8_128x32_c : 786.7 805.7
avg_8_128x32_rvv_i32 : 77.0 36.2
avg_8_128x64_c : 1615.5 1655.5
avg_8_128x64_rvv_i32 : 189.7 80.7
avg_8_128x128_c : 3182.0 3238.0
avg_8_128x128_rvv_i32 : 397.5 308.5
w_avg_8_2x2_c : 1.7 1.2
w_avg_8_2x2_rvv_i32 : 1.2 1.0
w_avg_8_2x4_c : 2.7 2.7
w_avg_8_2x4_rvv_i32 : 1.7 1.5
w_avg_8_2x8_c : 21.7 4.7
w_avg_8_2x8_rvv_i32 : 2.7 2.5
w_avg_8_2x16_c : 9.5 9.2
w_avg_8_2x16_rvv_i32 : 4.7 4.2
w_avg_8_2x32_c : 19.0 18.7
w_avg_8_2x32_rvv_i32 : 9.0 8.0
w_avg_8_2x64_c : 62.0 50.2
w_avg_8_2x64_rvv_i32 : 47.7 33.5
w_avg_8_2x128_c : 116.7 87.7
w_avg_8_2x128_rvv_i32 : 80.0 69.5
w_avg_8_4x2_c : 2.5 2.5
w_avg_8_4x2_rvv_i32 : 1.2 1.0
w_avg_8_4x4_c : 4.7 4.5
w_avg_8_4x4_rvv_i32 : 1.7 1.7
w_avg_8_4x8_c : 9.0 8.7
w_avg_8_4x8_rvv_i32 : 2.7 2.5
w_avg_8_4x16_c : 17.7 17.5
w_avg_8_4x16_rvv_i32 : 4.7 4.2
w_avg_8_4x32_c : 35.0 35.0
w_avg_8_4x32_rvv_i32 : 9.0 8.0
w_avg_8_4x64_c : 100.5 84.5
w_avg_8_4x64_rvv_i32 : 42.2 33.7
w_avg_8_4x128_c : 203.5 151.2
w_avg_8_4x128_rvv_i32 : 83.0 69.5
w_avg_8_8x2_c : 4.5 4.5
w_avg_8_8x2_rvv_i32 : 1.2 1.2
w_avg_8_8x4_c : 8.7 8.7
w_avg_8_8x4_rvv_i32 : 2.0 1.7
w_avg_8_8x8_c : 17.0 17.0
w_avg_8_8x8_rvv_i32 : 3.2 2.5
w_avg_8_8x16_c : 34.0 33.5
w_avg_8_8x16_rvv_i32 : 5.5 4.2
w_avg_8_8x32_c : 86.0 67.5
w_avg_8_8x32_rvv_i32 : 10.5 8.0
w_avg_8_8x64_c : 187.2 149.5
w_avg_8_8x64_rvv_i32 : 45.0 35.5
w_avg_8_8x128_c : 342.7 290.0
w_avg_8_8x128_rvv_i32 : 108.7 70.2
w_avg_8_16x2_c : 8.5 8.2
w_avg_8_16x2_rvv_i32 : 2.0 1.2
w_avg_8_16x4_c : 16.7 16.7
w_avg_8_16x4_rvv_i32 : 3.0 1.7
w_avg_8_16x8_c : 33.2 33.5
w_avg_8_16x8_rvv_i32 : 5.5 3.0
w_avg_8_16x16_c : 66.2 66.7
w_avg_8_16x16_rvv_i32 : 10.5 5.0
w_avg_8_16x32_c : 132.5 131.0
w_avg_8_16x32_rvv_i32 : 20.0 9.7
w_avg_8_16x64_c : 340.0 283.5
w_avg_8_16x64_rvv_i32 : 60.5 37.2
w_avg_8_16x128_c : 641.2 597.5
w_avg_8_16x128_rvv_i32 : 118.7 77.7
w_avg_8_32x2_c : 16.5 16.7
w_avg_8_32x2_rvv_i32 : 3.2 1.7
w_avg_8_32x4_c : 33.2 33.2
w_avg_8_32x4_rvv_i32 : 5.5 2.7
w_avg_8_32x8_c : 66.0 62.5
w_avg_8_32x8_rvv_i32 : 10.5 5.0
w_avg_8_32x16_c : 131.5 132.0
w_avg_8_32x16_rvv_i32 : 20.2 9.5
w_avg_8_32x32_c : 261.7 272.0
w_avg_8_32x32_rvv_i32 : 39.7 18.0
w_avg_8_32x64_c : 575.2 545.5
w_avg_8_32x64_rvv_i32 : 105.5 58.7
w_avg_8_32x128_c : 1154.2 1088.0
w_avg_8_32x128_rvv_i32 : 207.0 98.0
w_avg_8_64x2_c : 33.0 33.0
w_avg_8_64x2_rvv_i32 : 6.2 2.7
w_avg_8_64x4_c : 65.5 66.0
w_avg_8_64x4_rvv_i32 : 11.5 5.0
w_avg_8_64x8_c : 131.2 132.5
w_avg_8_64x8_rvv_i32 : 22.5 9.5
w_avg_8_64x16_c : 268.2 262.5
w_avg_8_64x16_rvv_i32 : 44.2 18.0
w_avg_8_64x32_c : 561.5 528.7
w_avg_8_64x32_rvv_i32 : 88.0 35.2
w_avg_8_64x64_c : 1136.2 1124.0
w_avg_8_64x64_rvv_i32 : 222.0 82.2
w_avg_8_64x128_c : 2345.0 2312.7
w_avg_8_64x128_rvv_i32 : 423.0 190.5
w_avg_8_128x2_c : 65.7 66.5
w_avg_8_128x2_rvv_i32 : 11.2 5.5
w_avg_8_128x4_c : 131.2 132.2
w_avg_8_128x4_rvv_i32 : 22.0 10.2
w_avg_8_128x8_c : 263.5 312.0
w_avg_8_128x8_rvv_i32 : 43.2 19.7
w_avg_8_128x16_c : 528.7 526.2
w_avg_8_128x16_rvv_i32 : 85.5 39.5
w_avg_8_128x32_c : 1067.7 1062.7
w_avg_8_128x32_rvv_i32 : 171.7 78.2
w_avg_8_128x64_c : 2234.7 2168.7
w_avg_8_128x64_rvv_i32 : 400.0 159.0
w_avg_8_128x128_c : 4752.5 4295.0
w_avg_8_128x128_rvv_i32 : 757.7 365.5
---
libavcodec/riscv/vvc/Makefile | 2 +
libavcodec/riscv/vvc/vvc_mc_rvv.S | 285 +++++++++++++++++++++++++++++
libavcodec/riscv/vvc/vvcdsp_init.c | 72 ++++++++
libavcodec/vvc/dsp.c | 4 +-
libavcodec/vvc/dsp.h | 1 +
5 files changed, 363 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/riscv/vvc/Makefile
create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S
create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c
diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
new file mode 100644
index 0000000000..582b051579
--- /dev/null
+++ b/libavcodec/riscv/vvc/Makefile
@@ -0,0 +1,2 @@
+OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
new file mode 100644
index 0000000000..89496c73a1
--- /dev/null
+++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vsetvlstatic8 w, vlen
+ .if \w == 2 && \vlen == 128
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e8, m1, ta, ma
+ .elseif \w == 32 && \vlen == 128
+ li t0, \w
+ vsetvli zero, t0, e8, m2, ta, ma
+ .elseif \w <= 4 && \vlen == 256
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e8, m1, ta, ma
+ .elseif \w == 64 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e8, m2, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e8, m4, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic16 w, vlen
+ .if \w == 2 && \vlen == 128
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e16, m2, ta, ma
+ .elseif \w == 32 && \vlen == 128
+ li t0, \w
+ vsetvli zero, t0, e16, m4, ta, ma
+ .elseif \w <= 4 && \vlen == 256
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e16, m2, ta, ma
+ .elseif \w == 64 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e16, m4, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e16, m8, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic32 w, vlen
+ .if \w == 2
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e32, m4, ta, ma
+ .elseif \w == 4 && \vlen == 256
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e32, m4, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e32, m8, ta, ma
+ .endif
+.endm
+
+.macro avg w, vlen, id
+\id\w\vlen:
+.if \w < 128
+ vsetvlstatic16 \w, \vlen
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ add t2, a0, a1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ addi a5, a5, -2
+ vle16.v v16, (t0)
+ vle16.v v24, (t1)
+ vadd.vv v8, v8, v0
+ vadd.vv v24, v24, v16
+ vmax.vx v8, v8, zero
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \w, \vlen
+ addi a2, a2, 128*4
+ vnclipu.wi v8, v8, 7
+ vnclipu.wi v24, v24, 7
+ addi a3, a3, 128*4
+ vse8.v v8, (a0)
+ vse8.v v24, (t2)
+ sh1add a0, a1, a0
+.else
+ addi a5, a5, -1
+ mv t1, a0
+ mv t2, a2
+ mv t3, a3
+ mv t4, a4
+1:
+ vsetvli t0, a4, e16, m8, ta, ma
+ sub a4, a4, t0
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vadd.vv v8, v8, v0
+ vmax.vx v8, v8, zero
+ vsetvli zero, zero, e8, m4, ta, ma
+ vnclipu.wi v8, v8, 7
+ vse8.v v8, (a0)
+ sh1add a2, t0, a2
+ sh1add a3, t0, a3
+ add a0, a0, t0
+ bnez a4, 1b
+ add a0, t1, a1
+ addi a2, t2, 128*2
+ addi a3, t3, 128*2
+ mv a4, t4
+.endif
+ bnez a5, \id\w\vlen\()b
+ ret
+.endm
+
+
+.macro AVG_JMP_TABLE id, vlen
+const jmp_table_\id\vlen
+ .4byte \id\()2\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()4\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()8\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()16\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()32\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
+endconst
+.endm
+
+.macro AVG_J vlen, id
+ clz t1, a4
+ neg t1, t1
+ lla t5, jmp_table_\id\vlen
+ sh2add t1, t1, t5
+ lw t1, ((__riscv_xlen-2)<<2)(t1)
+ add t1, t1, t5
+ jr t1
+.endm
+
+.macro func_avg vlen
+func ff_vvc_avg_8_rvv_\vlen\(), zve32x
+ AVG_JMP_TABLE 1, \vlen
+ csrwi vxrm, 0
+ AVG_J \vlen, 1
+ .irp w,2,4,8,16,32,64,128
+ avg \w, \vlen, 1
+ .endr
+endfunc
+.endm
+
+func_avg 128
+func_avg 256
+
+#if (__riscv_xlen == 64)
+.macro w_avg w, vlen, id
+\id\w\vlen:
+.if \w <= 32 || (\w == 64 && \vlen == 256)
+ vsetvlstatic16 \w, \vlen
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ vle16.v v0, (a2)
+ vle16.v v4, (a3)
+ addi a5, a5, -2
+ vle16.v v8, (t0)
+ vle16.v v12, (t1)
+ vwmul.vx v16, v0, a7
+ vwmul.vx v24, v8, a7
+ vwmacc.vx v16, t3, v4
+ vwmacc.vx v24, t3, v12
+ vsetvlstatic32 \w, \vlen
+ add t2, a0, a1
+ vadd.vx v16, v16, t4
+ vadd.vx v24, v24, t4
+ vsetvlstatic16 \w, \vlen
+ vnsrl.wx v16, v16, t6
+ vnsrl.wx v24, v24, t6
+ vmax.vx v16, v16, zero
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \w, \vlen
+ addi a2, a2, 128*4
+ vnclipu.wi v16, v16, 0
+ vnclipu.wi v24, v24, 0
+ vse8.v v16, (a0)
+ addi a3, a3, 128*4
+ vse8.v v24, (t2)
+ sh1add a0, a1, a0
+.else
+ addi a5, a5, -1
+ mv t1, a0
+ mv t2, a2
+ mv t5, a3
+ mv a6, a4
+1:
+ vsetvli t0, a4, e16, m4, ta, ma
+ sub a4, a4, t0
+ vle16.v v0, (a2)
+ vle16.v v4, (a3)
+ vwmul.vx v16, v0, a7
+ vwmacc.vx v16, t3, v4
+ vsetvli zero, zero, e32, m8, ta, ma
+ vadd.vx v16, v16, t4
+ vsetvli zero, zero, e16, m4, ta, ma
+ vnsrl.wx v16, v16, t6
+ vmax.vx v16, v16, zero
+ vsetvli zero, zero, e8, m2, ta, ma
+ vnclipu.wi v16, v16, 0
+ vse8.v v16, (a0)
+ sh1add a2, t0, a2
+ sh1add a3, t0, a3
+ add a0, a0, t0
+ bnez a4, 1b
+ add a0, t1, a1
+ addi a2, t2, 128*2
+ addi a3, t5, 128*2
+ mv a4, a6
+.endif
+ bnez a5, \id\w\vlen\()b
+ ret
+.endm
+
+.macro func_w_avg vlen
+func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x
+ AVG_JMP_TABLE 2, \vlen
+ csrwi vxrm, 0
+ addi t6, a6, 7
+ ld t3, (sp)
+ ld t4, 8(sp)
+ ld t5, 16(sp)
+ addi t4, t4, 1 // o0 + o1 + 1
+ add t4, t4, t5
+ addi t5, t6, -1 // shift - 1
+ sll t4, t4, t5
+ AVG_J \vlen, 2
+ .irp w,2,4,8,16,32,64,128
+ w_avg \w, \vlen, 2
+ .endr
+endfunc
+.endm
+
+func_w_avg 128
+func_w_avg 256
+#endif
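
For reference, reading the register setup in func_w_avg above (a7 = w0, t3 = w1, t6 = shift, t4 = the precomputed rounding offset), each output sample works out to

    dst[x] = clip_0_255( (w0*src0[x] + w1*src1[x] + ((o0 + o1 + 1) << (shift - 1))) >> shift ),   shift = denom + 7

where vmax.vx handles the clamp against 0 and the final vnclipu.wi with shift 0 saturates the high side to 255.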
diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
new file mode 100644
index 0000000000..9819a7c570
--- /dev/null
+++ b/libavcodec/riscv/vvc/vvcdsp_init.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vvc/dsp.h"
+
+#define bf(fn, bd, opt) fn##_##bd##_##opt
+
+#define AVG_PROTOTYPES(bd, opt) \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height); \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height, \
+ int denom, int w0, int w1, int o0, int o1);
+
+AVG_PROTOTYPES(8, rvv_128)
+AVG_PROTOTYPES(8, rvv_256)
+
+void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
+{
+#if HAVE_RVV
+ const int flags = av_get_cpu_flags();
+ int vlenb = ff_get_rv_vlenb();
+
+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ vlenb >= 32) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_256;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
+# endif
+ break;
+ default:
+ break;
+ }
+ } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ vlenb >= 16) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_128;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
+# endif
+ break;
+ default:
+ break;
+ }
+ }
+#endif
+}
diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
index 41e830a98a..c55a37d255 100644
--- a/libavcodec/vvc/dsp.c
+++ b/libavcodec/vvc/dsp.c
@@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
break;
}
-#if ARCH_X86
+#if ARCH_RISCV
+ ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
+#elif ARCH_X86
ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
#endif
}
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 1f14096c41..e03236dd76 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -180,6 +180,7 @@ typedef struct VVCDSPContext {
void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
+void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
#endif /* AVCODEC_VVC_DSP_H */
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-07-19 15:55 ` Rémi Denis-Courmont
2024-07-21 13:43 ` uk7b
@ 2024-07-21 13:45 ` flow gg
1 sibling, 0 replies; 19+ messages in thread
From: flow gg @ 2024-07-21 13:45 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Okay, updated it
Rémi Denis-Courmont <remi@remlab.net> wrote on Friday, 19 July 2024 at 23:56:
> On Thursday, 18 July 2024 at 18:04:15 EEST, flow gg wrote:
> > > Again, I don't think that a maximum multiplier belongs here. If the
> > > calling code cannot scale the multiplier up, then it should be a normal
> > > loop providing the same code for all VLENs.
> >
> > I think it's acceptable to add such a parameter, which isn't particularly
> > common in other files, because this vset is used for vvc_mc_rvv.S rather
> > than libavutil/riscv/asm.S.
>
> Maybe but that's really not my point. If you use the same LMUL for all
> VLENBs,
> then you should use the same function, not two copies of the exact same
> function.
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
@ 2024-08-03 10:30 uk7b
2024-08-03 10:31 ` flow gg
2024-08-15 8:10 ` Rémi Denis-Courmont
0 siblings, 2 replies; 19+ messages in thread
From: uk7b @ 2024-08-03 10:30 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
avg_8_2x2_c : 1.2 1.0
avg_8_2x2_rvv_i32 : 0.7 0.7
avg_8_2x4_c : 2.0 2.2
avg_8_2x4_rvv_i32 : 1.2 1.2
avg_8_2x8_c : 3.7 4.0
avg_8_2x8_rvv_i32 : 1.7 1.5
avg_8_2x16_c : 7.2 7.7
avg_8_2x16_rvv_i32 : 3.0 2.7
avg_8_2x32_c : 14.2 15.2
avg_8_2x32_rvv_i32 : 5.5 5.0
avg_8_2x64_c : 51.0 43.7
avg_8_2x64_rvv_i32 : 39.2 29.7
avg_8_2x128_c : 100.5 79.2
avg_8_2x128_rvv_i32 : 79.7 68.2
avg_8_4x2_c : 1.7 2.0
avg_8_4x2_rvv_i32 : 1.0 0.7
avg_8_4x4_c : 3.5 3.7
avg_8_4x4_rvv_i32 : 1.2 1.2
avg_8_4x8_c : 6.7 7.0
avg_8_4x8_rvv_i32 : 1.7 1.5
avg_8_4x16_c : 13.5 14.0
avg_8_4x16_rvv_i32 : 3.0 2.7
avg_8_4x32_c : 26.2 27.7
avg_8_4x32_rvv_i32 : 5.5 4.7
avg_8_4x64_c : 73.0 73.7
avg_8_4x64_rvv_i32 : 39.0 32.5
avg_8_4x128_c : 143.0 137.2
avg_8_4x128_rvv_i32 : 72.7 68.0
avg_8_8x2_c : 3.5 3.5
avg_8_8x2_rvv_i32 : 1.0 0.7
avg_8_8x4_c : 6.2 6.5
avg_8_8x4_rvv_i32 : 1.5 1.0
avg_8_8x8_c : 12.7 13.2
avg_8_8x8_rvv_i32 : 2.0 1.5
avg_8_8x16_c : 25.0 26.5
avg_8_8x16_rvv_i32 : 3.2 2.7
avg_8_8x32_c : 50.0 52.7
avg_8_8x32_rvv_i32 : 6.2 5.0
avg_8_8x64_c : 118.7 122.5
avg_8_8x64_rvv_i32 : 40.2 31.5
avg_8_8x128_c : 236.7 220.2
avg_8_8x128_rvv_i32 : 85.2 67.7
avg_8_16x2_c : 6.2 6.7
avg_8_16x2_rvv_i32 : 1.2 0.7
avg_8_16x4_c : 12.5 13.0
avg_8_16x4_rvv_i32 : 1.7 1.0
avg_8_16x8_c : 24.5 26.0
avg_8_16x8_rvv_i32 : 3.0 1.7
avg_8_16x16_c : 49.0 51.5
avg_8_16x16_rvv_i32 : 5.5 3.0
avg_8_16x32_c : 97.5 102.5
avg_8_16x32_rvv_i32 : 10.5 5.5
avg_8_16x64_c : 213.7 222.0
avg_8_16x64_rvv_i32 : 48.5 34.2
avg_8_16x128_c : 434.7 420.0
avg_8_16x128_rvv_i32 : 97.7 74.0
avg_8_32x2_c : 12.2 12.7
avg_8_32x2_rvv_i32 : 1.5 1.0
avg_8_32x4_c : 24.5 25.5
avg_8_32x4_rvv_i32 : 3.0 1.7
avg_8_32x8_c : 48.5 50.7
avg_8_32x8_rvv_i32 : 5.2 2.7
avg_8_32x16_c : 96.7 101.2
avg_8_32x16_rvv_i32 : 10.2 5.0
avg_8_32x32_c : 192.7 202.2
avg_8_32x32_rvv_i32 : 19.7 9.5
avg_8_32x64_c : 427.5 426.5
avg_8_32x64_rvv_i32 : 64.2 18.2
avg_8_32x128_c : 816.5 821.0
avg_8_32x128_rvv_i32 : 135.2 75.5
avg_8_64x2_c : 24.0 25.2
avg_8_64x2_rvv_i32 : 2.7 1.5
avg_8_64x4_c : 48.2 50.5
avg_8_64x4_rvv_i32 : 5.0 2.7
avg_8_64x8_c : 96.0 100.7
avg_8_64x8_rvv_i32 : 9.7 4.5
avg_8_64x16_c : 207.7 201.2
avg_8_64x16_rvv_i32 : 19.0 9.0
avg_8_64x32_c : 383.2 402.0
avg_8_64x32_rvv_i32 : 37.5 17.5
avg_8_64x64_c : 837.2 828.7
avg_8_64x64_rvv_i32 : 84.7 35.5
avg_8_64x128_c : 1640.7 1640.2
avg_8_64x128_rvv_i32 : 206.0 153.0
avg_8_128x2_c : 48.7 51.0
avg_8_128x2_rvv_i32 : 5.2 2.7
avg_8_128x4_c : 96.7 101.5
avg_8_128x4_rvv_i32 : 10.2 5.0
avg_8_128x8_c : 192.2 202.0
avg_8_128x8_rvv_i32 : 19.7 9.2
avg_8_128x16_c : 400.7 403.2
avg_8_128x16_rvv_i32 : 38.7 18.5
avg_8_128x32_c : 786.7 805.7
avg_8_128x32_rvv_i32 : 77.0 36.2
avg_8_128x64_c : 1615.5 1655.5
avg_8_128x64_rvv_i32 : 189.7 80.7
avg_8_128x128_c : 3182.0 3238.0
avg_8_128x128_rvv_i32 : 397.5 308.5
w_avg_8_2x2_c : 1.7 1.2
w_avg_8_2x2_rvv_i32 : 1.2 1.0
w_avg_8_2x4_c : 2.7 2.7
w_avg_8_2x4_rvv_i32 : 1.7 1.5
w_avg_8_2x8_c : 21.7 4.7
w_avg_8_2x8_rvv_i32 : 2.7 2.5
w_avg_8_2x16_c : 9.5 9.2
w_avg_8_2x16_rvv_i32 : 4.7 4.2
w_avg_8_2x32_c : 19.0 18.7
w_avg_8_2x32_rvv_i32 : 9.0 8.0
w_avg_8_2x64_c : 62.0 50.2
w_avg_8_2x64_rvv_i32 : 47.7 33.5
w_avg_8_2x128_c : 116.7 87.7
w_avg_8_2x128_rvv_i32 : 80.0 69.5
w_avg_8_4x2_c : 2.5 2.5
w_avg_8_4x2_rvv_i32 : 1.2 1.0
w_avg_8_4x4_c : 4.7 4.5
w_avg_8_4x4_rvv_i32 : 1.7 1.7
w_avg_8_4x8_c : 9.0 8.7
w_avg_8_4x8_rvv_i32 : 2.7 2.5
w_avg_8_4x16_c : 17.7 17.5
w_avg_8_4x16_rvv_i32 : 4.7 4.2
w_avg_8_4x32_c : 35.0 35.0
w_avg_8_4x32_rvv_i32 : 9.0 8.0
w_avg_8_4x64_c : 100.5 84.5
w_avg_8_4x64_rvv_i32 : 42.2 33.7
w_avg_8_4x128_c : 203.5 151.2
w_avg_8_4x128_rvv_i32 : 83.0 69.5
w_avg_8_8x2_c : 4.5 4.5
w_avg_8_8x2_rvv_i32 : 1.2 1.2
w_avg_8_8x4_c : 8.7 8.7
w_avg_8_8x4_rvv_i32 : 2.0 1.7
w_avg_8_8x8_c : 17.0 17.0
w_avg_8_8x8_rvv_i32 : 3.2 2.5
w_avg_8_8x16_c : 34.0 33.5
w_avg_8_8x16_rvv_i32 : 5.5 4.2
w_avg_8_8x32_c : 86.0 67.5
w_avg_8_8x32_rvv_i32 : 10.5 8.0
w_avg_8_8x64_c : 187.2 149.5
w_avg_8_8x64_rvv_i32 : 45.0 35.5
w_avg_8_8x128_c : 342.7 290.0
w_avg_8_8x128_rvv_i32 : 108.7 70.2
w_avg_8_16x2_c : 8.5 8.2
w_avg_8_16x2_rvv_i32 : 2.0 1.2
w_avg_8_16x4_c : 16.7 16.7
w_avg_8_16x4_rvv_i32 : 3.0 1.7
w_avg_8_16x8_c : 33.2 33.5
w_avg_8_16x8_rvv_i32 : 5.5 3.0
w_avg_8_16x16_c : 66.2 66.7
w_avg_8_16x16_rvv_i32 : 10.5 5.0
w_avg_8_16x32_c : 132.5 131.0
w_avg_8_16x32_rvv_i32 : 20.0 9.7
w_avg_8_16x64_c : 340.0 283.5
w_avg_8_16x64_rvv_i32 : 60.5 37.2
w_avg_8_16x128_c : 641.2 597.5
w_avg_8_16x128_rvv_i32 : 118.7 77.7
w_avg_8_32x2_c : 16.5 16.7
w_avg_8_32x2_rvv_i32 : 3.2 1.7
w_avg_8_32x4_c : 33.2 33.2
w_avg_8_32x4_rvv_i32 : 5.5 2.7
w_avg_8_32x8_c : 66.0 62.5
w_avg_8_32x8_rvv_i32 : 10.5 5.0
w_avg_8_32x16_c : 131.5 132.0
w_avg_8_32x16_rvv_i32 : 20.2 9.5
w_avg_8_32x32_c : 261.7 272.0
w_avg_8_32x32_rvv_i32 : 39.7 18.0
w_avg_8_32x64_c : 575.2 545.5
w_avg_8_32x64_rvv_i32 : 105.5 58.7
w_avg_8_32x128_c : 1154.2 1088.0
w_avg_8_32x128_rvv_i32 : 207.0 98.0
w_avg_8_64x2_c : 33.0 33.0
w_avg_8_64x2_rvv_i32 : 6.2 2.7
w_avg_8_64x4_c : 65.5 66.0
w_avg_8_64x4_rvv_i32 : 11.5 5.0
w_avg_8_64x8_c : 131.2 132.5
w_avg_8_64x8_rvv_i32 : 22.5 9.5
w_avg_8_64x16_c : 268.2 262.5
w_avg_8_64x16_rvv_i32 : 44.2 18.0
w_avg_8_64x32_c : 561.5 528.7
w_avg_8_64x32_rvv_i32 : 88.0 35.2
w_avg_8_64x64_c : 1136.2 1124.0
w_avg_8_64x64_rvv_i32 : 222.0 82.2
w_avg_8_64x128_c : 2345.0 2312.7
w_avg_8_64x128_rvv_i32 : 423.0 190.5
w_avg_8_128x2_c : 65.7 66.5
w_avg_8_128x2_rvv_i32 : 11.2 5.5
w_avg_8_128x4_c : 131.2 132.2
w_avg_8_128x4_rvv_i32 : 22.0 10.2
w_avg_8_128x8_c : 263.5 312.0
w_avg_8_128x8_rvv_i32 : 43.2 19.7
w_avg_8_128x16_c : 528.7 526.2
w_avg_8_128x16_rvv_i32 : 85.5 39.5
w_avg_8_128x32_c : 1067.7 1062.7
w_avg_8_128x32_rvv_i32 : 171.7 78.2
w_avg_8_128x64_c : 2234.7 2168.7
w_avg_8_128x64_rvv_i32 : 400.0 159.0
w_avg_8_128x128_c : 4752.5 4295.0
w_avg_8_128x128_rvv_i32 : 757.7 365.5
---
libavcodec/riscv/vvc/Makefile | 2 +
libavcodec/riscv/vvc/vvc_mc_rvv.S | 287 +++++++++++++++++++++++++++++
libavcodec/riscv/vvc/vvcdsp_init.c | 72 ++++++++
libavcodec/vvc/dsp.c | 2 +
libavcodec/vvc/dsp.h | 1 +
5 files changed, 364 insertions(+)
create mode 100644 libavcodec/riscv/vvc/Makefile
create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S
create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c
diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
new file mode 100644
index 0000000000..582b051579
--- /dev/null
+++ b/libavcodec/riscv/vvc/Makefile
@@ -0,0 +1,2 @@
+OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
new file mode 100644
index 0000000000..10e1bd67ee
--- /dev/null
+++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vsetvlstatic8 w, vlen
+ .if \w == 2 && \vlen == 128
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e8, m1, ta, ma
+ .elseif \w == 32 && \vlen == 128
+ li t0, \w
+ vsetvli zero, t0, e8, m2, ta, ma
+ .elseif \w <= 4 && \vlen == 256
+ vsetivli zero, \w, e8, mf8, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e8, mf4, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e8, mf2, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e8, m1, ta, ma
+ .elseif \w == 64 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e8, m2, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e8, m4, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic16 w, vlen
+ .if \w == 2 && \vlen == 128
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e16, m2, ta, ma
+ .elseif \w == 32 && \vlen == 128
+ li t0, \w
+ vsetvli zero, t0, e16, m4, ta, ma
+ .elseif \w <= 4 && \vlen == 256
+ vsetivli zero, \w, e16, mf4, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e16, mf2, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e16, m1, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e16, m2, ta, ma
+ .elseif \w == 64 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e16, m4, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e16, m8, ta, ma
+ .endif
+.endm
+
+.macro vsetvlstatic32 w, vlen
+ .if \w == 2
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w == 4 && \vlen == 128
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w == 8 && \vlen == 128
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w == 16 && \vlen == 128
+ vsetivli zero, \w, e32, m4, ta, ma
+ .elseif \w == 4 && \vlen == 256
+ vsetivli zero, \w, e32, mf2, ta, ma
+ .elseif \w == 8 && \vlen == 256
+ vsetivli zero, \w, e32, m1, ta, ma
+ .elseif \w == 16 && \vlen == 256
+ vsetivli zero, \w, e32, m2, ta, ma
+ .elseif \w == 32 && \vlen == 256
+ li t0, \w
+ vsetvli zero, t0, e32, m4, ta, ma
+ .else
+ li t0, \w
+ vsetvli zero, t0, e32, m8, ta, ma
+ .endif
+.endm
+
+.macro avg w, vlen, id
+\id\w\vlen:
+.if \w < 128
+ vsetvlstatic16 \w, \vlen
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ add t2, a0, a1
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ addi a5, a5, -2
+ vle16.v v16, (t0)
+ vle16.v v24, (t1)
+ vadd.vv v8, v8, v0
+ vadd.vv v24, v24, v16
+ vmax.vx v8, v8, zero
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \w, \vlen
+ addi a2, a2, 128*4
+ vnclipu.wi v8, v8, 7
+ vnclipu.wi v24, v24, 7
+ addi a3, a3, 128*4
+ vse8.v v8, (a0)
+ vse8.v v24, (t2)
+ sh1add a0, a1, a0
+.else
+ addi a5, a5, -1
+ mv t1, a0
+ mv t2, a2
+ mv t3, a3
+ mv t4, a4
+1:
+ vsetvli t0, a4, e16, m8, ta, ma
+ sub a4, a4, t0
+ vle16.v v0, (a2)
+ vle16.v v8, (a3)
+ vadd.vv v8, v8, v0
+ vmax.vx v8, v8, zero
+ vsetvli zero, zero, e8, m4, ta, ma
+ vnclipu.wi v8, v8, 7
+ vse8.v v8, (a0)
+ sh1add a2, t0, a2
+ sh1add a3, t0, a3
+ add a0, a0, t0
+ bnez a4, 1b
+ add a0, t1, a1
+ addi a2, t2, 128*2
+ addi a3, t3, 128*2
+ mv a4, t4
+.endif
+ bnez a5, \id\w\vlen\()b
+ ret
+.endm
+
+
+.macro AVG_JMP_TABLE id, vlen
+const jmp_table_\id\vlen
+ .4byte \id\()2\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()4\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()8\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()16\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()32\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
+ .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
+endconst
+.endm
+
+.macro AVG_J vlen, id
+ clz t1, a4
+ neg t1, t1
+ lla t5, jmp_table_\id\vlen
+ sh2add t1, t1, t5
+ lw t1, ((__riscv_xlen-2)<<2)(t1)
+ add t1, t1, t5
+ jr t1
+.endm
+
+.macro func_avg vlen
+func ff_vvc_avg_8_rvv_\vlen\(), zve32x
+ lpad 0
+ AVG_JMP_TABLE 1, \vlen
+ csrwi vxrm, 0
+ AVG_J \vlen, 1
+ .irp w,2,4,8,16,32,64,128
+ avg \w, \vlen, 1
+ .endr
+endfunc
+.endm
+
+func_avg 128
+func_avg 256
+
+#if (__riscv_xlen == 64)
+.macro w_avg w, vlen, id
+\id\w\vlen:
+.if \w <= 32 || (\w == 64 && \vlen == 256)
+ vsetvlstatic16 \w, \vlen
+ addi t0, a2, 128*2
+ addi t1, a3, 128*2
+ vle16.v v0, (a2)
+ vle16.v v4, (a3)
+ addi a5, a5, -2
+ vle16.v v8, (t0)
+ vle16.v v12, (t1)
+ vwmul.vx v16, v0, a7
+ vwmul.vx v24, v8, a7
+ vwmacc.vx v16, t3, v4
+ vwmacc.vx v24, t3, v12
+ vsetvlstatic32 \w, \vlen
+ add t2, a0, a1
+ vadd.vx v16, v16, t4
+ vadd.vx v24, v24, t4
+ vsetvlstatic16 \w, \vlen
+ vnsrl.wx v16, v16, t6
+ vnsrl.wx v24, v24, t6
+ vmax.vx v16, v16, zero
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \w, \vlen
+ addi a2, a2, 128*4
+ vnclipu.wi v16, v16, 0
+ vnclipu.wi v24, v24, 0
+ vse8.v v16, (a0)
+ addi a3, a3, 128*4
+ vse8.v v24, (t2)
+ sh1add a0, a1, a0
+.else
+ addi a5, a5, -1
+ mv t1, a0
+ mv t2, a2
+ mv t5, a3
+ mv a6, a4
+1:
+ vsetvli t0, a4, e16, m4, ta, ma
+ sub a4, a4, t0
+ vle16.v v0, (a2)
+ vle16.v v4, (a3)
+ vwmul.vx v16, v0, a7
+ vwmacc.vx v16, t3, v4
+ vsetvli zero, zero, e32, m8, ta, ma
+ vadd.vx v16, v16, t4
+ vsetvli zero, zero, e16, m4, ta, ma
+ vnsrl.wx v16, v16, t6
+ vmax.vx v16, v16, zero
+ vsetvli zero, zero, e8, m2, ta, ma
+ vnclipu.wi v16, v16, 0
+ vse8.v v16, (a0)
+ sh1add a2, t0, a2
+ sh1add a3, t0, a3
+ add a0, a0, t0
+ bnez a4, 1b
+ add a0, t1, a1
+ addi a2, t2, 128*2
+ addi a3, t5, 128*2
+ mv a4, a6
+.endif
+ bnez a5, \id\w\vlen\()b
+ ret
+.endm
+
+.macro func_w_avg vlen
+func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x
+ lpad 0
+ AVG_JMP_TABLE 2, \vlen
+ csrwi vxrm, 0
+ addi t6, a6, 7
+ ld t3, (sp)
+ ld t4, 8(sp)
+ ld t5, 16(sp)
+ addi t4, t4, 1 // o0 + o1 + 1
+ add t4, t4, t5
+ addi t5, t6, -1 // shift - 1
+ sll t4, t4, t5
+ AVG_J \vlen, 2
+ .irp w,2,4,8,16,32,64,128
+ w_avg \w, \vlen, 2
+ .endr
+endfunc
+.endm
+
+func_w_avg 128
+func_w_avg 256
+#endif
diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
new file mode 100644
index 0000000000..9819a7c570
--- /dev/null
+++ b/libavcodec/riscv/vvc/vvcdsp_init.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vvc/dsp.h"
+
+#define bf(fn, bd, opt) fn##_##bd##_##opt
+
+#define AVG_PROTOTYPES(bd, opt) \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height); \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height, \
+ int denom, int w0, int w1, int o0, int o1);
+
+AVG_PROTOTYPES(8, rvv_128)
+AVG_PROTOTYPES(8, rvv_256)
+
+void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
+{
+#if HAVE_RVV
+ const int flags = av_get_cpu_flags();
+ int vlenb = ff_get_rv_vlenb();
+
+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ vlenb >= 32) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_256;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
+# endif
+ break;
+ default:
+ break;
+ }
+ } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+ vlenb >= 16) {
+ switch (bd) {
+ case 8:
+ c->inter.avg = ff_vvc_avg_8_rvv_128;
+# if (__riscv_xlen == 64)
+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
+# endif
+ break;
+ default:
+ break;
+ }
+ }
+#endif
+}
diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
index 648d54ebb2..0d2e315395 100644
--- a/libavcodec/vvc/dsp.c
+++ b/libavcodec/vvc/dsp.c
@@ -123,6 +123,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
#if ARCH_AARCH64
ff_vvc_dsp_init_aarch64(vvcdsp, bit_depth);
+#elif ARCH_RISCV
+ ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
#elif ARCH_X86
ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
#endif
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 0b49b97021..4933cca891 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -181,6 +181,7 @@ typedef struct VVCDSPContext {
void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
void ff_vvc_dsp_init_aarch64(VVCDSPContext *hpc, const int bit_depth);
+void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
#endif /* AVCODEC_VVC_DSP_H */
--
2.46.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-08-03 10:30 uk7b
@ 2024-08-03 10:31 ` flow gg
2024-08-15 8:10 ` Rémi Denis-Courmont
1 sibling, 0 replies; 19+ messages in thread
From: flow gg @ 2024-08-03 10:31 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Added lpad and resolved conflicts with master.
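(For context: lpad 0 is the landing-pad instruction of the Zicfilp control-flow-integrity extension; since these functions are reached through the VVCDSPContext function pointers, each entry point now opens with it, and it should encode as a no-op where the extension is absent:)

func ff_vvc_avg_8_rvv_\vlen\(), zve32x
        lpad    0              // Zicfilp landing pad for the indirect call
        AVG_JMP_TABLE 1, \vlen
        csrwi   vxrm, 0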
<uk7b@foxmail.com> wrote on Saturday, 3 August 2024 at 18:31:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
> avg_8_2x2_c : 1.2 1.0
> avg_8_2x2_rvv_i32 : 0.7 0.7
> avg_8_2x4_c : 2.0 2.2
> avg_8_2x4_rvv_i32 : 1.2 1.2
> avg_8_2x8_c : 3.7 4.0
> avg_8_2x8_rvv_i32 : 1.7 1.5
> avg_8_2x16_c : 7.2 7.7
> avg_8_2x16_rvv_i32 : 3.0 2.7
> avg_8_2x32_c : 14.2 15.2
> avg_8_2x32_rvv_i32 : 5.5 5.0
> avg_8_2x64_c : 51.0 43.7
> avg_8_2x64_rvv_i32 : 39.2 29.7
> avg_8_2x128_c : 100.5 79.2
> avg_8_2x128_rvv_i32 : 79.7 68.2
> avg_8_4x2_c : 1.7 2.0
> avg_8_4x2_rvv_i32 : 1.0 0.7
> avg_8_4x4_c : 3.5 3.7
> avg_8_4x4_rvv_i32 : 1.2 1.2
> avg_8_4x8_c : 6.7 7.0
> avg_8_4x8_rvv_i32 : 1.7 1.5
> avg_8_4x16_c : 13.5 14.0
> avg_8_4x16_rvv_i32 : 3.0 2.7
> avg_8_4x32_c : 26.2 27.7
> avg_8_4x32_rvv_i32 : 5.5 4.7
> avg_8_4x64_c : 73.0 73.7
> avg_8_4x64_rvv_i32 : 39.0 32.5
> avg_8_4x128_c : 143.0 137.2
> avg_8_4x128_rvv_i32 : 72.7 68.0
> avg_8_8x2_c : 3.5 3.5
> avg_8_8x2_rvv_i32 : 1.0 0.7
> avg_8_8x4_c : 6.2 6.5
> avg_8_8x4_rvv_i32 : 1.5 1.0
> avg_8_8x8_c : 12.7 13.2
> avg_8_8x8_rvv_i32 : 2.0 1.5
> avg_8_8x16_c : 25.0 26.5
> avg_8_8x16_rvv_i32 : 3.2 2.7
> avg_8_8x32_c : 50.0 52.7
> avg_8_8x32_rvv_i32 : 6.2 5.0
> avg_8_8x64_c : 118.7 122.5
> avg_8_8x64_rvv_i32 : 40.2 31.5
> avg_8_8x128_c : 236.7 220.2
> avg_8_8x128_rvv_i32 : 85.2 67.7
> avg_8_16x2_c : 6.2 6.7
> avg_8_16x2_rvv_i32 : 1.2 0.7
> avg_8_16x4_c : 12.5 13.0
> avg_8_16x4_rvv_i32 : 1.7 1.0
> avg_8_16x8_c : 24.5 26.0
> avg_8_16x8_rvv_i32 : 3.0 1.7
> avg_8_16x16_c : 49.0 51.5
> avg_8_16x16_rvv_i32 : 5.5 3.0
> avg_8_16x32_c : 97.5 102.5
> avg_8_16x32_rvv_i32 : 10.5 5.5
> avg_8_16x64_c : 213.7 222.0
> avg_8_16x64_rvv_i32 : 48.5 34.2
> avg_8_16x128_c : 434.7 420.0
> avg_8_16x128_rvv_i32 : 97.7 74.0
> avg_8_32x2_c : 12.2 12.7
> avg_8_32x2_rvv_i32 : 1.5 1.0
> avg_8_32x4_c : 24.5 25.5
> avg_8_32x4_rvv_i32 : 3.0 1.7
> avg_8_32x8_c : 48.5 50.7
> avg_8_32x8_rvv_i32 : 5.2 2.7
> avg_8_32x16_c : 96.7 101.2
> avg_8_32x16_rvv_i32 : 10.2 5.0
> avg_8_32x32_c : 192.7 202.2
> avg_8_32x32_rvv_i32 : 19.7 9.5
> avg_8_32x64_c : 427.5 426.5
> avg_8_32x64_rvv_i32 : 64.2 18.2
> avg_8_32x128_c : 816.5 821.0
> avg_8_32x128_rvv_i32 : 135.2 75.5
> avg_8_64x2_c : 24.0 25.2
> avg_8_64x2_rvv_i32 : 2.7 1.5
> avg_8_64x4_c : 48.2 50.5
> avg_8_64x4_rvv_i32 : 5.0 2.7
> avg_8_64x8_c : 96.0 100.7
> avg_8_64x8_rvv_i32 : 9.7 4.5
> avg_8_64x16_c : 207.7 201.2
> avg_8_64x16_rvv_i32 : 19.0 9.0
> avg_8_64x32_c : 383.2 402.0
> avg_8_64x32_rvv_i32 : 37.5 17.5
> avg_8_64x64_c : 837.2 828.7
> avg_8_64x64_rvv_i32 : 84.7 35.5
> avg_8_64x128_c : 1640.7 1640.2
> avg_8_64x128_rvv_i32 : 206.0 153.0
> avg_8_128x2_c : 48.7 51.0
> avg_8_128x2_rvv_i32 : 5.2 2.7
> avg_8_128x4_c : 96.7 101.5
> avg_8_128x4_rvv_i32 : 10.2 5.0
> avg_8_128x8_c : 192.2 202.0
> avg_8_128x8_rvv_i32 : 19.7 9.2
> avg_8_128x16_c : 400.7 403.2
> avg_8_128x16_rvv_i32 : 38.7 18.5
> avg_8_128x32_c : 786.7 805.7
> avg_8_128x32_rvv_i32 : 77.0 36.2
> avg_8_128x64_c : 1615.5 1655.5
> avg_8_128x64_rvv_i32 : 189.7 80.7
> avg_8_128x128_c : 3182.0 3238.0
> avg_8_128x128_rvv_i32 : 397.5 308.5
> w_avg_8_2x2_c : 1.7 1.2
> w_avg_8_2x2_rvv_i32 : 1.2 1.0
> w_avg_8_2x4_c : 2.7 2.7
> w_avg_8_2x4_rvv_i32 : 1.7 1.5
> w_avg_8_2x8_c : 21.7 4.7
> w_avg_8_2x8_rvv_i32 : 2.7 2.5
> w_avg_8_2x16_c : 9.5 9.2
> w_avg_8_2x16_rvv_i32 : 4.7 4.2
> w_avg_8_2x32_c : 19.0 18.7
> w_avg_8_2x32_rvv_i32 : 9.0 8.0
> w_avg_8_2x64_c : 62.0 50.2
> w_avg_8_2x64_rvv_i32 : 47.7 33.5
> w_avg_8_2x128_c : 116.7 87.7
> w_avg_8_2x128_rvv_i32 : 80.0 69.5
> w_avg_8_4x2_c : 2.5 2.5
> w_avg_8_4x2_rvv_i32 : 1.2 1.0
> w_avg_8_4x4_c : 4.7 4.5
> w_avg_8_4x4_rvv_i32 : 1.7 1.7
> w_avg_8_4x8_c : 9.0 8.7
> w_avg_8_4x8_rvv_i32 : 2.7 2.5
> w_avg_8_4x16_c : 17.7 17.5
> w_avg_8_4x16_rvv_i32 : 4.7 4.2
> w_avg_8_4x32_c : 35.0 35.0
> w_avg_8_4x32_rvv_i32 : 9.0 8.0
> w_avg_8_4x64_c : 100.5 84.5
> w_avg_8_4x64_rvv_i32 : 42.2 33.7
> w_avg_8_4x128_c : 203.5 151.2
> w_avg_8_4x128_rvv_i32 : 83.0 69.5
> w_avg_8_8x2_c : 4.5 4.5
> w_avg_8_8x2_rvv_i32 : 1.2 1.2
> w_avg_8_8x4_c : 8.7 8.7
> w_avg_8_8x4_rvv_i32 : 2.0 1.7
> w_avg_8_8x8_c : 17.0 17.0
> w_avg_8_8x8_rvv_i32 : 3.2 2.5
> w_avg_8_8x16_c : 34.0 33.5
> w_avg_8_8x16_rvv_i32 : 5.5 4.2
> w_avg_8_8x32_c : 86.0 67.5
> w_avg_8_8x32_rvv_i32 : 10.5 8.0
> w_avg_8_8x64_c : 187.2 149.5
> w_avg_8_8x64_rvv_i32 : 45.0 35.5
> w_avg_8_8x128_c : 342.7 290.0
> w_avg_8_8x128_rvv_i32 : 108.7 70.2
> w_avg_8_16x2_c : 8.5 8.2
> w_avg_8_16x2_rvv_i32 : 2.0 1.2
> w_avg_8_16x4_c : 16.7 16.7
> w_avg_8_16x4_rvv_i32 : 3.0 1.7
> w_avg_8_16x8_c : 33.2 33.5
> w_avg_8_16x8_rvv_i32 : 5.5 3.0
> w_avg_8_16x16_c : 66.2 66.7
> w_avg_8_16x16_rvv_i32 : 10.5 5.0
> w_avg_8_16x32_c : 132.5 131.0
> w_avg_8_16x32_rvv_i32 : 20.0 9.7
> w_avg_8_16x64_c : 340.0 283.5
> w_avg_8_16x64_rvv_i32 : 60.5 37.2
> w_avg_8_16x128_c : 641.2 597.5
> w_avg_8_16x128_rvv_i32 : 118.7 77.7
> w_avg_8_32x2_c : 16.5 16.7
> w_avg_8_32x2_rvv_i32 : 3.2 1.7
> w_avg_8_32x4_c : 33.2 33.2
> w_avg_8_32x4_rvv_i32 : 5.5 2.7
> w_avg_8_32x8_c : 66.0 62.5
> w_avg_8_32x8_rvv_i32 : 10.5 5.0
> w_avg_8_32x16_c : 131.5 132.0
> w_avg_8_32x16_rvv_i32 : 20.2 9.5
> w_avg_8_32x32_c : 261.7 272.0
> w_avg_8_32x32_rvv_i32 : 39.7 18.0
> w_avg_8_32x64_c : 575.2 545.5
> w_avg_8_32x64_rvv_i32 : 105.5 58.7
> w_avg_8_32x128_c : 1154.2 1088.0
> w_avg_8_32x128_rvv_i32 : 207.0 98.0
> w_avg_8_64x2_c : 33.0 33.0
> w_avg_8_64x2_rvv_i32 : 6.2 2.7
> w_avg_8_64x4_c : 65.5 66.0
> w_avg_8_64x4_rvv_i32 : 11.5 5.0
> w_avg_8_64x8_c : 131.2 132.5
> w_avg_8_64x8_rvv_i32 : 22.5 9.5
> w_avg_8_64x16_c : 268.2 262.5
> w_avg_8_64x16_rvv_i32 : 44.2 18.0
> w_avg_8_64x32_c : 561.5 528.7
> w_avg_8_64x32_rvv_i32 : 88.0 35.2
> w_avg_8_64x64_c : 1136.2 1124.0
> w_avg_8_64x64_rvv_i32 : 222.0 82.2
> w_avg_8_64x128_c : 2345.0 2312.7
> w_avg_8_64x128_rvv_i32 : 423.0 190.5
> w_avg_8_128x2_c : 65.7 66.5
> w_avg_8_128x2_rvv_i32 : 11.2 5.5
> w_avg_8_128x4_c : 131.2 132.2
> w_avg_8_128x4_rvv_i32 : 22.0 10.2
> w_avg_8_128x8_c : 263.5 312.0
> w_avg_8_128x8_rvv_i32 : 43.2 19.7
> w_avg_8_128x16_c : 528.7 526.2
> w_avg_8_128x16_rvv_i32 : 85.5 39.5
> w_avg_8_128x32_c : 1067.7 1062.7
> w_avg_8_128x32_rvv_i32 : 171.7 78.2
> w_avg_8_128x64_c : 2234.7 2168.7
> w_avg_8_128x64_rvv_i32 : 400.0 159.0
> w_avg_8_128x128_c : 4752.5 4295.0
> w_avg_8_128x128_rvv_i32 : 757.7 365.5
> ---
> libavcodec/riscv/vvc/Makefile | 2 +
> libavcodec/riscv/vvc/vvc_mc_rvv.S | 287 +++++++++++++++++++++++++++++
> libavcodec/riscv/vvc/vvcdsp_init.c | 72 ++++++++
> libavcodec/vvc/dsp.c | 2 +
> libavcodec/vvc/dsp.h | 1 +
> 5 files changed, 364 insertions(+)
> create mode 100644 libavcodec/riscv/vvc/Makefile
> create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S
> create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c
>
> diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
> new file mode 100644
> index 0000000000..582b051579
> --- /dev/null
> +++ b/libavcodec/riscv/vvc/Makefile
> @@ -0,0 +1,2 @@
> +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
> +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
> diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> new file mode 100644
> index 0000000000..10e1bd67ee
> --- /dev/null
> +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> @@ -0,0 +1,287 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 w, vlen
> + .if \w == 2 && \vlen == 128
> + vsetivli zero, \w, e8, mf8, ta, ma
> + .elseif \w == 4 && \vlen == 128
> + vsetivli zero, \w, e8, mf4, ta, ma
> + .elseif \w == 8 && \vlen == 128
> + vsetivli zero, \w, e8, mf2, ta, ma
> + .elseif \w == 16 && \vlen == 128
> + vsetivli zero, \w, e8, m1, ta, ma
> + .elseif \w == 32 && \vlen == 128
> + li t0, \w
> + vsetvli zero, t0, e8, m2, ta, ma
> + .elseif \w <= 4 && \vlen == 256
> + vsetivli zero, \w, e8, mf8, ta, ma
> + .elseif \w == 8 && \vlen == 256
> + vsetivli zero, \w, e8, mf4, ta, ma
> + .elseif \w == 16 && \vlen == 256
> + vsetivli zero, \w, e8, mf2, ta, ma
> + .elseif \w == 32 && \vlen == 256
> + li t0, \w
> + vsetvli zero, t0, e8, m1, ta, ma
> + .elseif \w == 64 && \vlen == 256
> + li t0, \w
> + vsetvli zero, t0, e8, m2, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e8, m4, ta, ma
> + .endif
> +.endm
> +
> +.macro vsetvlstatic16 w, vlen
> + .if \w == 2 && \vlen == 128
> + vsetivli zero, \w, e16, mf4, ta, ma
> + .elseif \w == 4 && \vlen == 128
> + vsetivli zero, \w, e16, mf2, ta, ma
> + .elseif \w == 8 && \vlen == 128
> + vsetivli zero, \w, e16, m1, ta, ma
> + .elseif \w == 16 && \vlen == 128
> + vsetivli zero, \w, e16, m2, ta, ma
> + .elseif \w == 32 && \vlen == 128
> + li t0, \w
> + vsetvli zero, t0, e16, m4, ta, ma
> + .elseif \w <= 4 && \vlen == 256
> + vsetivli zero, \w, e16, mf4, ta, ma
> + .elseif \w == 8 && \vlen == 256
> + vsetivli zero, \w, e16, mf2, ta, ma
> + .elseif \w == 16 && \vlen == 256
> + vsetivli zero, \w, e16, m1, ta, ma
> + .elseif \w == 32 && \vlen == 256
> + li t0, \w
> + vsetvli zero, t0, e16, m2, ta, ma
> + .elseif \w == 64 && \vlen == 256
> + li t0, \w
> + vsetvli zero, t0, e16, m4, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e16, m8, ta, ma
> + .endif
> +.endm
> +
> +.macro vsetvlstatic32 w, vlen
> + .if \w == 2
> + vsetivli zero, \w, e32, mf2, ta, ma
> + .elseif \w == 4 && \vlen == 128
> + vsetivli zero, \w, e32, m1, ta, ma
> + .elseif \w == 8 && \vlen == 128
> + vsetivli zero, \w, e32, m2, ta, ma
> + .elseif \w == 16 && \vlen == 128
> + vsetivli zero, \w, e32, m4, ta, ma
> + .elseif \w == 4 && \vlen == 256
> + vsetivli zero, \w, e32, mf2, ta, ma
> + .elseif \w == 8 && \vlen == 256
> + vsetivli zero, \w, e32, m1, ta, ma
> + .elseif \w == 16 && \vlen == 256
> + vsetivli zero, \w, e32, m2, ta, ma
> + .elseif \w == 32 && \vlen == 256
> + li t0, \w
> + vsetvli zero, t0, e32, m4, ta, ma
> + .else
> + li t0, \w
> + vsetvli zero, t0, e32, m8, ta, ma
> + .endif
> +.endm
> +
> +.macro avg w, vlen, id
> +\id\w\vlen:
> +.if \w < 128
> + vsetvlstatic16 \w, \vlen
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + add t2, a0, a1
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + addi a5, a5, -2
> + vle16.v v16, (t0)
> + vle16.v v24, (t1)
> + vadd.vv v8, v8, v0
> + vadd.vv v24, v24, v16
> + vmax.vx v8, v8, zero
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \w, \vlen
> + addi a2, a2, 128*4
> + vnclipu.wi v8, v8, 7
> + vnclipu.wi v24, v24, 7
> + addi a3, a3, 128*4
> + vse8.v v8, (a0)
> + vse8.v v24, (t2)
> + sh1add a0, a1, a0
> +.else
> + addi a5, a5, -1
> + mv t1, a0
> + mv t2, a2
> + mv t3, a3
> + mv t4, a4
> +1:
> + vsetvli t0, a4, e16, m8, ta, ma
> + sub a4, a4, t0
> + vle16.v v0, (a2)
> + vle16.v v8, (a3)
> + vadd.vv v8, v8, v0
> + vmax.vx v8, v8, zero
> + vsetvli zero, zero, e8, m4, ta, ma
> + vnclipu.wi v8, v8, 7
> + vse8.v v8, (a0)
> + sh1add a2, t0, a2
> + sh1add a3, t0, a3
> + add a0, a0, t0
> + bnez a4, 1b
> + add a0, t1, a1
> + addi a2, t2, 128*2
> + addi a3, t3, 128*2
> + mv a4, t4
> +.endif
> + bnez a5, \id\w\vlen\()b
> + ret
> +.endm
> +
> +
> +.macro AVG_JMP_TABLE id, vlen
> +const jmp_table_\id\vlen
> + .4byte \id\()2\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()4\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()8\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()16\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()32\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
> + .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
> +endconst
> +.endm
> +
> +.macro AVG_J vlen, id
> + clz t1, a4
> + neg t1, t1
> + lla t5, jmp_table_\id\vlen
> + sh2add t1, t1, t5
> + lw t1, ((__riscv_xlen-2)<<2)(t1)
> + add t1, t1, t5
> + jr t1
> +.endm
> +
> +.macro func_avg vlen
> +func ff_vvc_avg_8_rvv_\vlen\(), zve32x
> + lpad 0
> + AVG_JMP_TABLE 1, \vlen
> + csrwi vxrm, 0
> + AVG_J \vlen, 1
> + .irp w,2,4,8,16,32,64,128
> + avg \w, \vlen, 1
> + .endr
> +endfunc
> +.endm
> +
> +func_avg 128
> +func_avg 256
> +
> +#if (__riscv_xlen == 64)
> +.macro w_avg w, vlen, id
> +\id\w\vlen:
> +.if \w <= 32 || (\w == 64 && \vlen == 256)
> + vsetvlstatic16 \w, \vlen
> + addi t0, a2, 128*2
> + addi t1, a3, 128*2
> + vle16.v v0, (a2)
> + vle16.v v4, (a3)
> + addi a5, a5, -2
> + vle16.v v8, (t0)
> + vle16.v v12, (t1)
> + vwmul.vx v16, v0, a7
> + vwmul.vx v24, v8, a7
> + vwmacc.vx v16, t3, v4
> + vwmacc.vx v24, t3, v12
> + vsetvlstatic32 \w, \vlen
> + add t2, a0, a1
> + vadd.vx v16, v16, t4
> + vadd.vx v24, v24, t4
> + vsetvlstatic16 \w, \vlen
> + vnsrl.wx v16, v16, t6
> + vnsrl.wx v24, v24, t6
> + vmax.vx v16, v16, zero
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \w, \vlen
> + addi a2, a2, 128*4
> + vnclipu.wi v16, v16, 0
> + vnclipu.wi v24, v24, 0
> + vse8.v v16, (a0)
> + addi a3, a3, 128*4
> + vse8.v v24, (t2)
> + sh1add a0, a1, a0
> +.else
> + addi a5, a5, -1
> + mv t1, a0
> + mv t2, a2
> + mv t5, a3
> + mv a6, a4
> +1:
> + vsetvli t0, a4, e16, m4, ta, ma
> + sub a4, a4, t0
> + vle16.v v0, (a2)
> + vle16.v v4, (a3)
> + vwmul.vx v16, v0, a7
> + vwmacc.vx v16, t3, v4
> + vsetvli zero, zero, e32, m8, ta, ma
> + vadd.vx v16, v16, t4
> + vsetvli zero, zero, e16, m4, ta, ma
> + vnsrl.wx v16, v16, t6
> + vmax.vx v16, v16, zero
> + vsetvli zero, zero, e8, m2, ta, ma
> + vnclipu.wi v16, v16, 0
> + vse8.v v16, (a0)
> + sh1add a2, t0, a2
> + sh1add a3, t0, a3
> + add a0, a0, t0
> + bnez a4, 1b
> + add a0, t1, a1
> + addi a2, t2, 128*2
> + addi a3, t5, 128*2
> + mv a4, a6
> +.endif
> + bnez a5, \id\w\vlen\()b
> + ret
> +.endm
> +
> +.macro func_w_avg vlen
> +func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x
> + lpad 0
> + AVG_JMP_TABLE 2, \vlen
> + csrwi vxrm, 0
> + addi t6, a6, 7
> + ld t3, (sp)
> + ld t4, 8(sp)
> + ld t5, 16(sp)
> + addi t4, t4, 1 // o0 + o1 + 1
> + add t4, t4, t5
> + addi t5, t6, -1 // shift - 1
> + sll t4, t4, t5
> + AVG_J \vlen, 2
> + .irp w,2,4,8,16,32,64,128
> + w_avg \w, \vlen, 2
> + .endr
> +endfunc
> +.endm
> +
> +func_w_avg 128
> +func_w_avg 256
> +#endif
> diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
> new file mode 100644
> index 0000000000..9819a7c570
> --- /dev/null
> +++ b/libavcodec/riscv/vvc/vvcdsp_init.c
> @@ -0,0 +1,72 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> +#include "libavcodec/vvc/dsp.h"
> +
> +#define bf(fn, bd, opt) fn##_##bd##_##opt
> +
> +#define AVG_PROTOTYPES(bd, opt) \
> +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
> + const int16_t *src0, const int16_t *src1, int width, int height); \
> +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
> + const int16_t *src0, const int16_t *src1, int width, int height, \
> + int denom, int w0, int w1, int o0, int o1);
> +
> +AVG_PROTOTYPES(8, rvv_128)
> +AVG_PROTOTYPES(8, rvv_256)
> +
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> +{
> +#if HAVE_RVV
> + const int flags = av_get_cpu_flags();
> + int vlenb = ff_get_rv_vlenb();
> +
> + if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> + vlenb >= 32) {
> + switch (bd) {
> + case 8:
> + c->inter.avg = ff_vvc_avg_8_rvv_256;
> +# if (__riscv_xlen == 64)
> + c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
> +# endif
> + break;
> + default:
> + break;
> + }
> + } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> + vlenb >= 16) {
> + switch (bd) {
> + case 8:
> + c->inter.avg = ff_vvc_avg_8_rvv_128;
> +# if (__riscv_xlen == 64)
> + c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
> +# endif
> + break;
> + default:
> + break;
> + }
> + }
> +#endif
> +}
> diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> index 648d54ebb2..0d2e315395 100644
> --- a/libavcodec/vvc/dsp.c
> +++ b/libavcodec/vvc/dsp.c
> @@ -123,6 +123,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
>
> #if ARCH_AARCH64
> ff_vvc_dsp_init_aarch64(vvcdsp, bit_depth);
> +#elif ARCH_RISCV
> + ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> #elif ARCH_X86
> ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
> #endif
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 0b49b97021..4933cca891 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -181,6 +181,7 @@ typedef struct VVCDSPContext {
> void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
>
> void ff_vvc_dsp_init_aarch64(VVCDSPContext *hpc, const int bit_depth);
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
> void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
>
> #endif /* AVCODEC_VVC_DSP_H */
> --
> 2.46.0
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg
2024-08-03 10:30 uk7b
2024-08-03 10:31 ` flow gg
@ 2024-08-15 8:10 ` Rémi Denis-Courmont
1 sibling, 0 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-08-15 8:10 UTC (permalink / raw)
To: FFmpeg development discussions and patches, uk7b, ffmpeg-devel; +Cc: sunyuechi
On 3 August 2024 13:30:34 GMT+03:00, uk7b@foxmail.com wrote:
>From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
>avg_8_2x2_c : 1.2 1.0
>avg_8_2x2_rvv_i32 : 0.7 0.7
>avg_8_2x4_c : 2.0 2.2
>avg_8_2x4_rvv_i32 : 1.2 1.2
>avg_8_2x8_c : 3.7 4.0
>avg_8_2x8_rvv_i32 : 1.7 1.5
>avg_8_2x16_c : 7.2 7.7
>avg_8_2x16_rvv_i32 : 3.0 2.7
>avg_8_2x32_c : 14.2 15.2
>avg_8_2x32_rvv_i32 : 5.5 5.0
>avg_8_2x64_c : 51.0 43.7
>avg_8_2x64_rvv_i32 : 39.2 29.7
>avg_8_2x128_c : 100.5 79.2
>avg_8_2x128_rvv_i32 : 79.7 68.2
>avg_8_4x2_c : 1.7 2.0
>avg_8_4x2_rvv_i32 : 1.0 0.7
>avg_8_4x4_c : 3.5 3.7
>avg_8_4x4_rvv_i32 : 1.2 1.2
>avg_8_4x8_c : 6.7 7.0
>avg_8_4x8_rvv_i32 : 1.7 1.5
>avg_8_4x16_c : 13.5 14.0
>avg_8_4x16_rvv_i32 : 3.0 2.7
>avg_8_4x32_c : 26.2 27.7
>avg_8_4x32_rvv_i32 : 5.5 4.7
>avg_8_4x64_c : 73.0 73.7
>avg_8_4x64_rvv_i32 : 39.0 32.5
>avg_8_4x128_c : 143.0 137.2
>avg_8_4x128_rvv_i32 : 72.7 68.0
>avg_8_8x2_c : 3.5 3.5
>avg_8_8x2_rvv_i32 : 1.0 0.7
>avg_8_8x4_c : 6.2 6.5
>avg_8_8x4_rvv_i32 : 1.5 1.0
>avg_8_8x8_c : 12.7 13.2
>avg_8_8x8_rvv_i32 : 2.0 1.5
>avg_8_8x16_c : 25.0 26.5
>avg_8_8x16_rvv_i32 : 3.2 2.7
>avg_8_8x32_c : 50.0 52.7
>avg_8_8x32_rvv_i32 : 6.2 5.0
>avg_8_8x64_c : 118.7 122.5
>avg_8_8x64_rvv_i32 : 40.2 31.5
>avg_8_8x128_c : 236.7 220.2
>avg_8_8x128_rvv_i32 : 85.2 67.7
>avg_8_16x2_c : 6.2 6.7
>avg_8_16x2_rvv_i32 : 1.2 0.7
>avg_8_16x4_c : 12.5 13.0
>avg_8_16x4_rvv_i32 : 1.7 1.0
>avg_8_16x8_c : 24.5 26.0
>avg_8_16x8_rvv_i32 : 3.0 1.7
>avg_8_16x16_c : 49.0 51.5
>avg_8_16x16_rvv_i32 : 5.5 3.0
>avg_8_16x32_c : 97.5 102.5
>avg_8_16x32_rvv_i32 : 10.5 5.5
>avg_8_16x64_c : 213.7 222.0
>avg_8_16x64_rvv_i32 : 48.5 34.2
>avg_8_16x128_c : 434.7 420.0
>avg_8_16x128_rvv_i32 : 97.7 74.0
>avg_8_32x2_c : 12.2 12.7
>avg_8_32x2_rvv_i32 : 1.5 1.0
>avg_8_32x4_c : 24.5 25.5
>avg_8_32x4_rvv_i32 : 3.0 1.7
>avg_8_32x8_c : 48.5 50.7
>avg_8_32x8_rvv_i32 : 5.2 2.7
>avg_8_32x16_c : 96.7 101.2
>avg_8_32x16_rvv_i32 : 10.2 5.0
>avg_8_32x32_c : 192.7 202.2
>avg_8_32x32_rvv_i32 : 19.7 9.5
>avg_8_32x64_c : 427.5 426.5
>avg_8_32x64_rvv_i32 : 64.2 18.2
>avg_8_32x128_c : 816.5 821.0
>avg_8_32x128_rvv_i32 : 135.2 75.5
>avg_8_64x2_c : 24.0 25.2
>avg_8_64x2_rvv_i32 : 2.7 1.5
>avg_8_64x4_c : 48.2 50.5
>avg_8_64x4_rvv_i32 : 5.0 2.7
>avg_8_64x8_c : 96.0 100.7
>avg_8_64x8_rvv_i32 : 9.7 4.5
>avg_8_64x16_c : 207.7 201.2
>avg_8_64x16_rvv_i32 : 19.0 9.0
>avg_8_64x32_c : 383.2 402.0
>avg_8_64x32_rvv_i32 : 37.5 17.5
>avg_8_64x64_c : 837.2 828.7
>avg_8_64x64_rvv_i32 : 84.7 35.5
>avg_8_64x128_c : 1640.7 1640.2
>avg_8_64x128_rvv_i32 : 206.0 153.0
>avg_8_128x2_c : 48.7 51.0
>avg_8_128x2_rvv_i32 : 5.2 2.7
>avg_8_128x4_c : 96.7 101.5
>avg_8_128x4_rvv_i32 : 10.2 5.0
>avg_8_128x8_c : 192.2 202.0
>avg_8_128x8_rvv_i32 : 19.7 9.2
>avg_8_128x16_c : 400.7 403.2
>avg_8_128x16_rvv_i32 : 38.7 18.5
>avg_8_128x32_c : 786.7 805.7
>avg_8_128x32_rvv_i32 : 77.0 36.2
>avg_8_128x64_c : 1615.5 1655.5
>avg_8_128x64_rvv_i32 : 189.7 80.7
>avg_8_128x128_c : 3182.0 3238.0
>avg_8_128x128_rvv_i32 : 397.5 308.5
>w_avg_8_2x2_c : 1.7 1.2
>w_avg_8_2x2_rvv_i32 : 1.2 1.0
>w_avg_8_2x4_c : 2.7 2.7
>w_avg_8_2x4_rvv_i32 : 1.7 1.5
>w_avg_8_2x8_c : 21.7 4.7
>w_avg_8_2x8_rvv_i32 : 2.7 2.5
>w_avg_8_2x16_c : 9.5 9.2
>w_avg_8_2x16_rvv_i32 : 4.7 4.2
>w_avg_8_2x32_c : 19.0 18.7
>w_avg_8_2x32_rvv_i32 : 9.0 8.0
>w_avg_8_2x64_c : 62.0 50.2
>w_avg_8_2x64_rvv_i32 : 47.7 33.5
>w_avg_8_2x128_c : 116.7 87.7
>w_avg_8_2x128_rvv_i32 : 80.0 69.5
>w_avg_8_4x2_c : 2.5 2.5
>w_avg_8_4x2_rvv_i32 : 1.2 1.0
>w_avg_8_4x4_c : 4.7 4.5
>w_avg_8_4x4_rvv_i32 : 1.7 1.7
>w_avg_8_4x8_c : 9.0 8.7
>w_avg_8_4x8_rvv_i32 : 2.7 2.5
>w_avg_8_4x16_c : 17.7 17.5
>w_avg_8_4x16_rvv_i32 : 4.7 4.2
>w_avg_8_4x32_c : 35.0 35.0
>w_avg_8_4x32_rvv_i32 : 9.0 8.0
>w_avg_8_4x64_c : 100.5 84.5
>w_avg_8_4x64_rvv_i32 : 42.2 33.7
>w_avg_8_4x128_c : 203.5 151.2
>w_avg_8_4x128_rvv_i32 : 83.0 69.5
>w_avg_8_8x2_c : 4.5 4.5
>w_avg_8_8x2_rvv_i32 : 1.2 1.2
>w_avg_8_8x4_c : 8.7 8.7
>w_avg_8_8x4_rvv_i32 : 2.0 1.7
>w_avg_8_8x8_c : 17.0 17.0
>w_avg_8_8x8_rvv_i32 : 3.2 2.5
>w_avg_8_8x16_c : 34.0 33.5
>w_avg_8_8x16_rvv_i32 : 5.5 4.2
>w_avg_8_8x32_c : 86.0 67.5
>w_avg_8_8x32_rvv_i32 : 10.5 8.0
>w_avg_8_8x64_c : 187.2 149.5
>w_avg_8_8x64_rvv_i32 : 45.0 35.5
>w_avg_8_8x128_c : 342.7 290.0
>w_avg_8_8x128_rvv_i32 : 108.7 70.2
>w_avg_8_16x2_c : 8.5 8.2
>w_avg_8_16x2_rvv_i32 : 2.0 1.2
>w_avg_8_16x4_c : 16.7 16.7
>w_avg_8_16x4_rvv_i32 : 3.0 1.7
>w_avg_8_16x8_c : 33.2 33.5
>w_avg_8_16x8_rvv_i32 : 5.5 3.0
>w_avg_8_16x16_c : 66.2 66.7
>w_avg_8_16x16_rvv_i32 : 10.5 5.0
>w_avg_8_16x32_c : 132.5 131.0
>w_avg_8_16x32_rvv_i32 : 20.0 9.7
>w_avg_8_16x64_c : 340.0 283.5
>w_avg_8_16x64_rvv_i32 : 60.5 37.2
>w_avg_8_16x128_c : 641.2 597.5
>w_avg_8_16x128_rvv_i32 : 118.7 77.7
>w_avg_8_32x2_c : 16.5 16.7
>w_avg_8_32x2_rvv_i32 : 3.2 1.7
>w_avg_8_32x4_c : 33.2 33.2
>w_avg_8_32x4_rvv_i32 : 5.5 2.7
>w_avg_8_32x8_c : 66.0 62.5
>w_avg_8_32x8_rvv_i32 : 10.5 5.0
>w_avg_8_32x16_c : 131.5 132.0
>w_avg_8_32x16_rvv_i32 : 20.2 9.5
>w_avg_8_32x32_c : 261.7 272.0
>w_avg_8_32x32_rvv_i32 : 39.7 18.0
>w_avg_8_32x64_c : 575.2 545.5
>w_avg_8_32x64_rvv_i32 : 105.5 58.7
>w_avg_8_32x128_c : 1154.2 1088.0
>w_avg_8_32x128_rvv_i32 : 207.0 98.0
>w_avg_8_64x2_c : 33.0 33.0
>w_avg_8_64x2_rvv_i32 : 6.2 2.7
>w_avg_8_64x4_c : 65.5 66.0
>w_avg_8_64x4_rvv_i32 : 11.5 5.0
>w_avg_8_64x8_c : 131.2 132.5
>w_avg_8_64x8_rvv_i32 : 22.5 9.5
>w_avg_8_64x16_c : 268.2 262.5
>w_avg_8_64x16_rvv_i32 : 44.2 18.0
>w_avg_8_64x32_c : 561.5 528.7
>w_avg_8_64x32_rvv_i32 : 88.0 35.2
>w_avg_8_64x64_c : 1136.2 1124.0
>w_avg_8_64x64_rvv_i32 : 222.0 82.2
>w_avg_8_64x128_c : 2345.0 2312.7
>w_avg_8_64x128_rvv_i32 : 423.0 190.5
>w_avg_8_128x2_c : 65.7 66.5
>w_avg_8_128x2_rvv_i32 : 11.2 5.5
>w_avg_8_128x4_c : 131.2 132.2
>w_avg_8_128x4_rvv_i32 : 22.0 10.2
>w_avg_8_128x8_c : 263.5 312.0
>w_avg_8_128x8_rvv_i32 : 43.2 19.7
>w_avg_8_128x16_c : 528.7 526.2
>w_avg_8_128x16_rvv_i32 : 85.5 39.5
>w_avg_8_128x32_c : 1067.7 1062.7
>w_avg_8_128x32_rvv_i32 : 171.7 78.2
>w_avg_8_128x64_c : 2234.7 2168.7
>w_avg_8_128x64_rvv_i32 : 400.0 159.0
>w_avg_8_128x128_c : 4752.5 4295.0
>w_avg_8_128x128_rvv_i32 : 757.7 365.5
>---
> libavcodec/riscv/vvc/Makefile | 2 +
> libavcodec/riscv/vvc/vvc_mc_rvv.S | 287 +++++++++++++++++++++++++++++
> libavcodec/riscv/vvc/vvcdsp_init.c | 72 ++++++++
> libavcodec/vvc/dsp.c | 2 +
> libavcodec/vvc/dsp.h | 1 +
> 5 files changed, 364 insertions(+)
> create mode 100644 libavcodec/riscv/vvc/Makefile
> create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S
> create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c
>
>diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
>new file mode 100644
>index 0000000000..582b051579
>--- /dev/null
>+++ b/libavcodec/riscv/vvc/Makefile
>@@ -0,0 +1,2 @@
>+OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
>+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
>diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
>new file mode 100644
>index 0000000000..10e1bd67ee
>--- /dev/null
>+++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
>@@ -0,0 +1,287 @@
>+/*
>+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include "libavutil/riscv/asm.S"
>+
>+.macro vsetvlstatic8 w, vlen
>+ .if \w == 2 && \vlen == 128
>+ vsetivli zero, \w, e8, mf8, ta, ma
>+ .elseif \w == 4 && \vlen == 128
>+ vsetivli zero, \w, e8, mf4, ta, ma
>+ .elseif \w == 8 && \vlen == 128
>+ vsetivli zero, \w, e8, mf2, ta, ma
>+ .elseif \w == 16 && \vlen == 128
>+ vsetivli zero, \w, e8, m1, ta, ma
>+ .elseif \w == 32 && \vlen == 128
>+ li t0, \w
>+ vsetvli zero, t0, e8, m2, ta, ma
>+ .elseif \w <= 4 && \vlen == 256
>+ vsetivli zero, \w, e8, mf8, ta, ma
>+ .elseif \w == 8 && \vlen == 256
>+ vsetivli zero, \w, e8, mf4, ta, ma
>+ .elseif \w == 16 && \vlen == 256
>+ vsetivli zero, \w, e8, mf2, ta, ma
>+ .elseif \w == 32 && \vlen == 256
>+ li t0, \w
>+ vsetvli zero, t0, e8, m1, ta, ma
>+ .elseif \w == 64 && \vlen == 256
>+ li t0, \w
>+ vsetvli zero, t0, e8, m2, ta, ma
>+ .else
>+ li t0, \w
>+ vsetvli zero, t0, e8, m4, ta, ma
>+ .endif
>+.endm
>+
>+.macro vsetvlstatic16 w, vlen
>+ .if \w == 2 && \vlen == 128
>+ vsetivli zero, \w, e16, mf4, ta, ma
>+ .elseif \w == 4 && \vlen == 128
>+ vsetivli zero, \w, e16, mf2, ta, ma
>+ .elseif \w == 8 && \vlen == 128
>+ vsetivli zero, \w, e16, m1, ta, ma
>+ .elseif \w == 16 && \vlen == 128
>+ vsetivli zero, \w, e16, m2, ta, ma
>+ .elseif \w == 32 && \vlen == 128
>+ li t0, \w
>+ vsetvli zero, t0, e16, m4, ta, ma
>+ .elseif \w <= 4 && \vlen == 256
>+ vsetivli zero, \w, e16, mf4, ta, ma
>+ .elseif \w == 8 && \vlen == 256
>+ vsetivli zero, \w, e16, mf2, ta, ma
>+ .elseif \w == 16 && \vlen == 256
>+ vsetivli zero, \w, e16, m1, ta, ma
>+ .elseif \w == 32 && \vlen == 256
>+ li t0, \w
>+ vsetvli zero, t0, e16, m2, ta, ma
>+ .elseif \w == 64 && \vlen == 256
>+ li t0, \w
>+ vsetvli zero, t0, e16, m4, ta, ma
>+ .else
>+ li t0, \w
>+ vsetvli zero, t0, e16, m8, ta, ma
>+ .endif
>+.endm
>+
>+.macro vsetvlstatic32 w, vlen
>+ .if \w == 2
>+ vsetivli zero, \w, e32, mf2, ta, ma
>+ .elseif \w == 4 && \vlen == 128
>+ vsetivli zero, \w, e32, m1, ta, ma
>+ .elseif \w == 8 && \vlen == 128
>+ vsetivli zero, \w, e32, m2, ta, ma
>+ .elseif \w == 16 && \vlen == 128
>+ vsetivli zero, \w, e32, m4, ta, ma
>+ .elseif \w == 4 && \vlen == 256
>+ vsetivli zero, \w, e32, mf2, ta, ma
>+ .elseif \w == 8 && \vlen == 256
>+ vsetivli zero, \w, e32, m1, ta, ma
>+ .elseif \w == 16 && \vlen == 256
>+ vsetivli zero, \w, e32, m2, ta, ma
>+ .elseif \w == 32 && \vlen == 256
>+ li t0, \w
>+ vsetvli zero, t0, e32, m4, ta, ma
>+ .else
>+ li t0, \w
>+ vsetvli zero, t0, e32, m8, ta, ma
>+ .endif
>+.endm
>+
>+.macro avg w, vlen, id
>+\id\w\vlen:
>+.if \w < 128
>+ vsetvlstatic16 \w, \vlen
>+ addi t0, a2, 128*2
>+ addi t1, a3, 128*2
>+ add t2, a0, a1
>+ vle16.v v0, (a2)
>+ vle16.v v8, (a3)
>+ addi a5, a5, -2
>+ vle16.v v16, (t0)
>+ vle16.v v24, (t1)
>+ vadd.vv v8, v8, v0
>+ vadd.vv v24, v24, v16
>+ vmax.vx v8, v8, zero
>+ vmax.vx v24, v24, zero
With short widths, scaling vertically (rather than horizontally) with strides is likely faster. See also the h.264 weight and biweight functions, which provide a similar algorithm.
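
For reference, a minimal scalar sketch of the layout that suggestion implies (an illustration only, not FFmpeg code): with w == 2, each row of 16-bit intermediate samples is one 32-bit unit and consecutive rows sit MAX_PB_SIZE samples apart, so a strided vector load could fetch one row per element and a single pass would cover vl rows instead of two columns. The rounding below (add 64, shift right by 7, clamp to [0, 255]) mirrors the 8-bit avg path above; MAX_PB_SIZE = 128 is taken from the hard-coded 128*2-byte strides in the assembly.

    #include <stdint.h>
    #include <stddef.h>

    #define MAX_PB_SIZE 128  /* matches the 128*2-byte source stride used above */

    /* Scalar model of a "vertical" pass over a w == 2 block: element k of the
     * flattened block maps to row k >> 1, column k & 1, with rows spaced
     * MAX_PB_SIZE samples apart -- the access pattern a strided RVV load
     * (one row per element) would use to cover several rows per pass. */
    static void avg8_w2_flat(uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *src0, const int16_t *src1, int h)
    {
        for (int k = 0; k < 2 * h; k++) {
            int row = k >> 1, col = k & 1;
            int v = src0[row * MAX_PB_SIZE + col] + src1[row * MAX_PB_SIZE + col];
            v = (v + 64) >> 7;            /* round-to-nearest, shift by 7 */
            if (v < 0)   v = 0;           /* vmax.vx ..., zero            */
            if (v > 255) v = 255;         /* vnclipu.wi saturation        */
            dst[row * dst_stride + col] = (uint8_t)v;
        }
    }
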
>+ vsetvlstatic8 \w, \vlen
>+ addi a2, a2, 128*4
>+ vnclipu.wi v8, v8, 7
>+ vnclipu.wi v24, v24, 7
>+ addi a3, a3, 128*4
>+ vse8.v v8, (a0)
>+ vse8.v v24, (t2)
>+ sh1add a0, a1, a0
>+.else
>+ addi a5, a5, -1
>+ mv t1, a0
>+ mv t2, a2
>+ mv t3, a3
>+ mv t4, a4
>+1:
>+ vsetvli t0, a4, e16, m8, ta, ma
>+ sub a4, a4, t0
>+ vle16.v v0, (a2)
>+ vle16.v v8, (a3)
>+ vadd.vv v8, v8, v0
>+ vmax.vx v8, v8, zero
>+ vsetvli zero, zero, e8, m4, ta, ma
>+ vnclipu.wi v8, v8, 7
>+ vse8.v v8, (a0)
>+ sh1add a2, t0, a2
>+ sh1add a3, t0, a3
>+ add a0, a0, t0
>+ bnez a4, 1b
>+ add a0, t1, a1
>+ addi a2, t2, 128*2
>+ addi a3, t3, 128*2
>+ mv a4, t4
>+.endif
>+ bnez a5, \id\w\vlen\()b
>+ ret
>+.endm
>+
>+
>+.macro AVG_JMP_TABLE id, vlen
>+const jmp_table_\id\vlen
>+ .4byte \id\()2\vlen\()f - jmp_table_\id\vlen
>+ .4byte \id\()4\vlen\()f - jmp_table_\id\vlen
>+ .4byte \id\()8\vlen\()f - jmp_table_\id\vlen
>+ .4byte \id\()16\vlen\()f - jmp_table_\id\vlen
>+ .4byte \id\()32\vlen\()f - jmp_table_\id\vlen
>+ .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
>+ .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
>+endconst
>+.endm
>+
>+.macro AVG_J vlen, id
>+ clz t1, a4
>+ neg t1, t1
>+ lla t5, jmp_table_\id\vlen
>+ sh2add t1, t1, t5
>+ lw t1, ((__riscv_xlen-2)<<2)(t1)
>+ add t1, t1, t5
>+ jr t1
>+.endm
>+
>+.macro func_avg vlen
>+func ff_vvc_avg_8_rvv_\vlen\(), zve32x
>+ lpad 0
>+ AVG_JMP_TABLE 1, \vlen
>+ csrwi vxrm, 0
>+ AVG_J \vlen, 1
>+ .irp w,2,4,8,16,32,64,128
>+ avg \w, \vlen, 1
>+ .endr
>+endfunc
>+.endm
>+
>+func_avg 128
>+func_avg 256
>+
>+#if (__riscv_xlen == 64)
>+.macro w_avg w, vlen, id
>+\id\w\vlen:
>+.if \w <= 32 || (\w == 64 && \vlen == 256)
>+ vsetvlstatic16 \w, \vlen
>+ addi t0, a2, 128*2
>+ addi t1, a3, 128*2
>+ vle16.v v0, (a2)
>+ vle16.v v4, (a3)
>+ addi a5, a5, -2
>+ vle16.v v8, (t0)
>+ vle16.v v12, (t1)
>+ vwmul.vx v16, v0, a7
>+ vwmul.vx v24, v8, a7
>+ vwmacc.vx v16, t3, v4
>+ vwmacc.vx v24, t3, v12
>+ vsetvlstatic32 \w, \vlen
>+ add t2, a0, a1
>+ vadd.vx v16, v16, t4
>+ vadd.vx v24, v24, t4
>+ vsetvlstatic16 \w, \vlen
>+ vnsrl.wx v16, v16, t6
>+ vnsrl.wx v24, v24, t6
>+ vmax.vx v16, v16, zero
>+ vmax.vx v24, v24, zero
>+ vsetvlstatic8 \w, \vlen
>+ addi a2, a2, 128*4
>+ vnclipu.wi v16, v16, 0
>+ vnclipu.wi v24, v24, 0
>+ vse8.v v16, (a0)
>+ addi a3, a3, 128*4
>+ vse8.v v24, (t2)
>+ sh1add a0, a1, a0
>+.else
>+ addi a5, a5, -1
>+ mv t1, a0
>+ mv t2, a2
>+ mv t5, a3
>+ mv a6, a4
>+1:
>+ vsetvli t0, a4, e16, m4, ta, ma
>+ sub a4, a4, t0
>+ vle16.v v0, (a2)
>+ vle16.v v4, (a3)
>+ vwmul.vx v16, v0, a7
>+ vwmacc.vx v16, t3, v4
>+ vsetvli zero, zero, e32, m8, ta, ma
>+ vadd.vx v16, v16, t4
>+ vsetvli zero, zero, e16, m4, ta, ma
>+ vnsrl.wx v16, v16, t6
>+ vmax.vx v16, v16, zero
>+ vsetvli zero, zero, e8, m2, ta, ma
>+ vnclipu.wi v16, v16, 0
>+ vse8.v v16, (a0)
>+ sh1add a2, t0, a2
>+ sh1add a3, t0, a3
>+ add a0, a0, t0
>+ bnez a4, 1b
>+ add a0, t1, a1
>+ addi a2, t2, 128*2
>+ addi a3, t5, 128*2
>+ mv a4, a6
>+.endif
>+ bnez a5, \id\w\vlen\()b
>+ ret
>+.endm
>+
>+.macro func_w_avg vlen
>+func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x
>+ lpad 0
>+ AVG_JMP_TABLE 2, \vlen
>+ csrwi vxrm, 0
>+ addi t6, a6, 7
>+ ld t3, (sp)
>+ ld t4, 8(sp)
>+ ld t5, 16(sp)
>+ addi t4, t4, 1 // o0 + o1 + 1
>+ add t4, t4, t5
>+ addi t5, t6, -1 // shift - 1
>+ sll t4, t4, t5
>+ AVG_J \vlen, 2
>+ .irp w,2,4,8,16,32,64,128
>+ w_avg \w, \vlen, 2
>+ .endr
>+endfunc
>+.endm
>+
>+func_w_avg 128
>+func_w_avg 256
>+#endif
>diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
>new file mode 100644
>index 0000000000..9819a7c570
>--- /dev/null
>+++ b/libavcodec/riscv/vvc/vvcdsp_init.c
>@@ -0,0 +1,72 @@
>+/*
>+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include "config.h"
>+
>+#include "libavutil/attributes.h"
>+#include "libavutil/cpu.h"
>+#include "libavutil/riscv/cpu.h"
>+#include "libavcodec/vvc/dsp.h"
>+
>+#define bf(fn, bd, opt) fn##_##bd##_##opt
>+
>+#define AVG_PROTOTYPES(bd, opt) \
>+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
>+ const int16_t *src0, const int16_t *src1, int width, int height); \
>+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
>+ const int16_t *src0, const int16_t *src1, int width, int height, \
>+ int denom, int w0, int w1, int o0, int o1);
>+
>+AVG_PROTOTYPES(8, rvv_128)
>+AVG_PROTOTYPES(8, rvv_256)
>+
>+void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
>+{
>+#if HAVE_RVV
>+ const int flags = av_get_cpu_flags();
>+ int vlenb = ff_get_rv_vlenb();
>+
>+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
>+ vlenb >= 32) {
>+ switch (bd) {
>+ case 8:
>+ c->inter.avg = ff_vvc_avg_8_rvv_256;
>+# if (__riscv_xlen == 64)
>+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
>+# endif
>+ break;
>+ default:
>+ break;
>+ }
>+ } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
>+ vlenb >= 16) {
>+ switch (bd) {
>+ case 8:
>+ c->inter.avg = ff_vvc_avg_8_rvv_128;
>+# if (__riscv_xlen == 64)
>+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
>+# endif
>+ break;
>+ default:
>+ break;
>+ }
>+ }
>+#endif
>+}
>diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
>index 648d54ebb2..0d2e315395 100644
>--- a/libavcodec/vvc/dsp.c
>+++ b/libavcodec/vvc/dsp.c
>@@ -123,6 +123,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
>
> #if ARCH_AARCH64
> ff_vvc_dsp_init_aarch64(vvcdsp, bit_depth);
>+#elif ARCH_RISCV
>+ ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> #elif ARCH_X86
> ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
> #endif
>diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
>index 0b49b97021..4933cca891 100644
>--- a/libavcodec/vvc/dsp.h
>+++ b/libavcodec/vvc/dsp.h
>@@ -181,6 +181,7 @@ typedef struct VVCDSPContext {
> void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
>
> void ff_vvc_dsp_init_aarch64(VVCDSPContext *hpc, const int bit_depth);
>+void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
> void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
>
> #endif /* AVCODEC_VVC_DSP_H */
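
As a reading aid for the register setup in func_w_avg above (t6 = denom + 7, t4 = (o0 + o1 + 1) << (shift - 1)), here is a hedged scalar model of what the 8-bit w_avg path appears to compute, reconstructed from the assembly rather than taken from FFmpeg's C reference; MAX_PB_SIZE = 128 is again inferred from the hard-coded strides.

    #include <stdint.h>
    #include <stddef.h>

    #define MAX_PB_SIZE 128

    static void w_avg8_model(uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *src0, const int16_t *src1,
                             int w, int h, int denom, int w0, int w1,
                             int o0, int o1)
    {
        const int shift  = denom + 7;                     /* addi t6, a6, 7 */
        const int offset = (o0 + o1 + 1) << (shift - 1);  /* sll  t4, t4, t5 */

        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                /* vwmul.vx + vwmacc.vx + vadd.vx, then vnsrl.wx by shift */
                int v = (src0[x] * w0 + src1[x] * w1 + offset) >> shift;
                /* vmax.vx ..., zero and vnclipu.wi clamp to [0, 255] */
                dst[x] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
            }
            src0 += MAX_PB_SIZE;
            src1 += MAX_PB_SIZE;
            dst  += dst_stride;
        }
    }
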
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
end of thread, other threads:[~2024-08-15 8:10 UTC | newest]
Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-21 7:37 [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg uk7b
2024-05-21 7:38 ` flow gg
2024-05-21 7:47 ` uk7b
2024-05-21 7:48 ` flow gg
2024-05-21 16:03 ` Rémi Denis-Courmont
2024-05-21 17:24 ` flow gg
2024-05-21 19:24 ` uk7b
2024-05-21 19:26 ` flow gg
2024-05-25 8:27 ` Rémi Denis-Courmont
2024-07-08 15:41 [FFmpeg-devel] [PATCH v5] " Rémi Denis-Courmont
2024-07-10 10:02 ` [FFmpeg-devel] [PATCH] " uk7b
2024-07-16 14:21 ` Rémi Denis-Courmont
2024-07-18 15:02 ` uk7b
2024-07-18 15:04 ` flow gg
2024-07-19 15:55 ` Rémi Denis-Courmont
2024-07-21 13:43 ` uk7b
2024-07-21 13:45 ` flow gg
2024-08-03 10:30 uk7b
2024-08-03 10:31 ` flow gg
2024-08-15 8:10 ` Rémi Denis-Courmont