Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH v4] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC
@ 2025-05-20  7:58 daichengrong
  2025-05-24 16:18 ` Rémi Denis-Courmont
  2025-05-30  7:40 ` [FFmpeg-devel] [PATCH v5] " daichengrong
  0 siblings, 2 replies; 5+ messages in thread
From: daichengrong @ 2025-05-20  7:58 UTC (permalink / raw)
  To: ffmpeg-devel

From: daichengrong <daichengrong@iscas.ac.cn>

Since there were no comments on v2 and v3, we have continued to optimize according to the comments on v1.
We spilled values from vector registers to memory instead of using slides to help improve performance, and optimized the extraction of elements from vector registers.
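
For reference, a minimal sketch of the two approaches (illustrative register choices, not taken verbatim from the patch; s0 is assumed to hold a scratch buffer and v0 a vector of four e16 coefficients):

        # (a) slide-based extraction: one slide plus vmv.x.s per lane
        vsetivli        zero, 4, e16, m1, ta, ma
        vmv.x.s         t0, v0                  # lane 0
        vslidedown.vi   v8, v0, 1
        vmv.x.s         t1, v8                  # lane 1
        # (b) spill once, then use plain scalar loads
        vse16.v         v0, (s0)
        lh              t0, 0(s0)               # lane 0
        lh              t1, 2(s0)               # lane 1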

On Banana PI F3:
        hevc_idct_32x32_8_c:                                119920.0 ( 1.00x)
        hevc_idct_32x32_8_rvv_i64:                           20247.3 ( 5.92x) (V4)
        hevc_idct_32x32_8_rvv_i64:                           28718.3 ( 4.14x) (V3)
        hevc_idct_32x32_8_rvv_i64:                           28503.7 ( 4.17x) (V2)
        hevc_idct_32x32_8_rvv_i64:                           51254.4 ( 2.33x) (V1)

Changes in v4:
        Optimize away unnecessary slide operations
        Extract more scalars from vector registers into general-purpose registers

Changes in v3:
        Remove the slides in the transposition and spill values from vector registers to the stack

Changes in v2:
        Delete tabs
        Remove the unnecessary t0 in vsetivli
        Extract scalars directly into general-purpose registers

---
 libavcodec/riscv/Makefile           |   1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 957 ++++++++++++++++++++++++++++
 libavcodec/riscv/hevcdsp_init.c     |  52 +-
 3 files changed, 990 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 0000000000..586c97bdf9
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,957 @@
+/*
+ * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+const trans, align=4
+        .2byte          64, 83, 64, 36
+        .2byte          89, 75, 50, 18
+        .2byte          90, 87, 80, 70
+        .2byte          57, 43, 25, 9
+        .2byte          90, 90, 88, 85
+        .2byte          82, 78, 73, 67
+        .2byte          61, 54, 46, 38
+        .2byte          31, 22, 13, 4
+endconst
+
+const trans_index, align=4
+        .2byte  0, 16, 32, 48, 62, 46, 30, 14
+        .2byte  2, 18, 34, 50, 60, 44, 28, 12
+        .2byte  4, 20, 36, 52, 58, 42, 26, 10
+        .2byte  6, 22, 38, 54, 56, 40, 24, 8
+endconst
+
+.macro sum_sub out, in, c, op, p
+        mv t0, \c
+        .ifc \op, -
+                neg    t0, t0
+        .endif
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        .ifc \p, 2
+                vslidedown.vi    v8, \in, 4
+                vwmacc.vx    \out, t0, v8
+        .else
+                vwmacc.vx    \out, t0, \in
+        .endif
+.endm
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
+        .ifc \op0, -
+                neg    \t0, \t0
+        .endif
+        .ifc \op1, -
+                neg    \t1, \t1
+        .endif
+        .ifc \op2, -
+                neg    \t2, \t2
+        .endif
+        .ifc \op3, -
+                neg    \t3, \t3
+        .endif
+
+.ifc \p, 2
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        vslidedown.vi    v8, \in, 4
+
+        vwmacc.vx    v24, \t0, v8
+        vwmacc.vx    v25, \t1, v8
+        vwmacc.vx    v26, \t2, v8
+        vwmacc.vx    v27, \t3, v8
+
+.else
+
+        vwmacc.vx    v24, \t0, \in
+        vwmacc.vx    v25, \t1, \in
+        vwmacc.vx    v26, \t2, \in
+        vwmacc.vx    v27, \t3, \in
+.endif
+
+       .ifc \op0, -
+                neg    \t0, \t0
+        .endif
+        .ifc \op1, -
+                neg    \t1, \t1
+        .endif
+        .ifc \op2, -
+                neg    \t2, \t2
+        .endif
+        .ifc \op3, -
+                neg    \t3, \t3
+        .endif
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vadd.vv             \tmp_p, \e, \o
+        vsub.vv             \tmp_m, \e, \o
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vadd.vv              v20, \in0, \in1
+        vsub.vv              \in0, \in0, \in1
+        vadd.vv              \in1, \in2, \in3
+        vsub.vv              \in2, \in2, \in3
+        vadd.vv              \in3, \in4, \in5
+        vsub.vv              \in4, \in4, \in5
+        vadd.vv              \in5, \in6, \in7
+        vsub.vv              \in6, \in6, \in7
+.endm
+
+.macro multiply in
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vse16.v         \in, (s0)
+        ld      s2, 0*2(s0)
+        ld      s3, 1*2(s0)
+        ld      s4, 2*2(s0)
+        ld      s5, 3*2(s0)
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        vwmul.vx        v24, v4, s2
+        vwmul.vx        v25, v4, s3
+        vwmul.vx        v26, v4, s4
+        vwmul.vx        v27, v4, s5
+.endm
+
+func tr_block1, zve64x
+        multiply        v0
+
+        addi    sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+        sd    x\i,8*(\i - 10)(sp)
+.endr
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vse16.v         v0, (s0)
+        ld      x10, 0*2(s0)
+        ld      x11, 1*2(s0)
+        ld      x12, 2*2(s0)
+        ld      x13, 3*2(s0)
+        vse16.v         v1, (s0)
+        ld      x14, 0*2(s0)
+        ld      x15, 1*2(s0)
+        ld      x16, 2*2(s0)
+        ld      x17, 3*2(s0)
+        vse16.v         v2, (s0)
+        ld      x18, 0*2(s0)
+        ld      x19, 1*2(s0)
+        ld      x20, 2*2(s0)
+        ld      x21, 3*2(s0)
+        vse16.v         v3, (s0)
+        ld      x22, 0*2(s0)
+        ld      x23, 1*2(s0)
+        ld      x24, 2*2(s0)
+        ld      x25, 3*2(s0)
+
+        add_member32    v4,  x11, x14, x17, x20, +, +, +, +, 2
+        add_member32    v5,  x12, x17, x22, x24, +, +, +, -
+        add_member32    v5,  x13, x20, x24, x17, +, +, -, -, 2
+        add_member32    v6,  x14, x23, x19, x10, +, +, -, -
+        add_member32    v6,  x15, x25, x14, x16, +, -, -, -, 2
+        add_member32    v7,  x16, x22, x10, x23, +, -, -, -
+        add_member32    v7,  x17, x19, x15, x21, +, -, -, +, 2
+        add_member32    v16, x18, x16, x20, x14, +, -, -, +
+        add_member32    v16, x19, x13, x25, x12, +, -, -, +, 2
+        add_member32    v17, x20, x11, x21, x19, +, -, +, +
+        add_member32    v17, x21, x12, x16, x25, +, -, +, -, 2
+        add_member32    v18, x22, x15, x11, x18, +, -, +, -
+        add_member32    v18, x23, x18, x13, x11, +, -, +, -, 2
+        add_member32    v19, x24, x21, x18, x15, +, -, +, -
+        add_member32    v19, x25, x24, x23, x22, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+        ld    x\i, 8*(\i - 10)(sp)
+.endr
+        addi    sp, sp, 8*16
+
+        ret
+endfunc
+
+func tr_block2, zve64x
+        multiply        v1
+
+        addi    sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+        sd    x\i,8*(\i - 10)(sp)
+.endr
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vse16.v         v0, (s0)
+        ld      x10, 0*2(s0)
+        ld      x11, 1*2(s0)
+        ld      x12, 2*2(s0)
+        ld      x13, 3*2(s0)
+        vse16.v         v1, (s0)
+        ld      x14, 0*2(s0)
+        ld      x15, 1*2(s0)
+        ld      x16, 2*2(s0)
+        ld      x17, 3*2(s0)
+        vse16.v         v2, (s0)
+        ld      x18, 0*2(s0)
+        ld      x19, 1*2(s0)
+        ld      x20, 2*2(s0)
+        ld      x21, 3*2(s0)
+        vse16.v         v3, (s0)
+        ld      x22, 0*2(s0)
+        ld      x23, 1*2(s0)
+        ld      x24, 2*2(s0)
+        ld      x25, 3*2(s0)
+
+        add_member32    v4,  x23, x25, x22, x19, +, -, -, -, 2
+        add_member32    v5,  x19, x14, x10, x15, -, -, -, -
+        add_member32    v5,  x10, x16, x23, x21, -, -, -, +, 2
+        add_member32    v6,  x18, x24, x15, x13, -, +, +, +
+        add_member32    v6,  x24, x13, x17, x23, +, +, +, -, 2
+        add_member32    v7,  x15, x17, x21, x10, +, +, -, -
+        add_member32    v7,  x13, x23, x11, x25, +, -, -, +, 2
+        add_member32    v16, x22, x12, x24, x11, +, -, -, +
+        add_member32    v16, x20, x18, x14, x24, -, -, +, +, 2
+        add_member32    v17, x11, x22, x18, x12, -, +, +, -
+        add_member32    v17, x17, x11, x20, x22, -, +, -, -, 2
+        add_member32    v18, x25, x19, x12, x14, +, +, -, +
+        add_member32    v18, x16, x21, x25, x20, +, -, -, +, 2
+        add_member32    v19, x12, x11, x13, x16, +, -, +, -
+        add_member32    v19, x21, x20, x19, x18, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+        ld    x\i, 8*(\i - 10)(sp)
+.endr
+        addi    sp, sp, 8*16
+
+        ret
+endfunc
+
+func tr_block3, zve64x
+        multiply        v2
+        addi    sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+        sd    x\i,8*(\i - 10)(sp)
+.endr
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vse16.v         v0, (s0)
+        ld      x10, 0*2(s0)
+        ld      x11, 1*2(s0)
+        ld      x12, 2*2(s0)
+        ld      x13, 3*2(s0)
+        vse16.v         v1, (s0)
+        ld      x14, 0*2(s0)
+        ld      x15, 1*2(s0)
+        ld      x16, 2*2(s0)
+        ld      x17, 3*2(s0)
+        vse16.v         v2, (s0)
+        ld      x18, 0*2(s0)
+        ld      x19, 1*2(s0)
+        ld      x20, 2*2(s0)
+        ld      x21, 3*2(s0)
+        vse16.v         v3, (s0)
+        ld      x22, 0*2(s0)
+        ld      x23, 1*2(s0)
+        ld      x24, 2*2(s0)
+        ld      x25, 3*2(s0)
+
+        add_member32    v4,  x16, x13, x10, x12, -, -, -, -, 2
+        add_member32    v5,  x20, x25, x21, x16, -, -, +, +
+        add_member32    v5,  x14, x12, x19, x25, +, +, +, -, 2
+        add_member32    v6,  x22, x20, x11, x17, +, -, -, -
+        add_member32    v6,  x12, x18, x22, x10, -, -, +, +, 2
+        add_member32    v7,  x24, x14, x18, x20, -, +, +, -
+        add_member32    v7,  x10, x24, x12, x22, +, +, -, -, 2
+        add_member32    v16, x25, x11, x23, x13, -, -, +, +
+        add_member32    v16, x11, x21, x17, x15, -, +, +, -, 2
+        add_member32    v17, x23, x17, x13, x24, +, +, -, +
+        add_member32    v17, x13, x15, x24, x18, +, -, +, +, 2
+        add_member32    v18, x21, x23, x16, x11, -, -, +, -
+        add_member32    v18, x15, x10, x14, x19, -, +, -, +, 2
+        add_member32    v19, x19, x22, x25, x23, +, -, +, +
+        add_member32    v19, x17, x16, x15, x14, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+        ld    x\i, 8*(\i - 10)(sp)
+.endr
+        addi    sp, sp, 8*16
+
+        ret
+endfunc
+
+func tr_block4, zve64x
+        multiply        v3
+        addi    sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+        sd    x\i,8*(\i - 10)(sp)
+.endr
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vse16.v         v0, (s0)
+        ld      x10, 0*2(s0)
+        ld      x11, 1*2(s0)
+        ld      x12, 2*2(s0)
+        ld      x13, 3*2(s0)
+        vse16.v         v1, (s0)
+        ld      x14, 0*2(s0)
+        ld      x15, 1*2(s0)
+        ld      x16, 2*2(s0)
+        ld      x17, 3*2(s0)
+        vse16.v         v2, (s0)
+        ld      x18, 0*2(s0)
+        ld      x19, 1*2(s0)
+        ld      x20, 2*2(s0)
+        ld      x21, 3*2(s0)
+        vse16.v         v3, (s0)
+        ld      x22, 0*2(s0)
+        ld      x23, 1*2(s0)
+        ld      x24, 2*2(s0)
+        ld      x25, 3*2(s0)
+
+        add_member32    v4,  x15, x18, x21, x24, -, -, -, -, 2
+        add_member32    v5,  x10, x13, x18, x23, +, +, +, +
+        add_member32    v5,  x18, x10, x15, x22, -, -, -, -, 2
+        add_member32    v6,  x25, x16, x12, x21, +, +, +, +
+        add_member32    v6,  x19, x21, x10, x20, +, -, -, -, 2
+        add_member32    v7,  x12, x25, x13, x19, -, -, +, +
+        add_member32    v7,  x14, x20, x16, x18, +, +, -, -, 2
+        add_member32    v16, x21, x15, x19, x17, -, -, +, +
+        add_member32    v16, x23, x11, x22, x16, -, +, -, -, 2
+        add_member32    v17, x16, x14, x25, x15, +, -, +, +
+        add_member32    v17, x11, x19, x23, x14, -, +, +, -, 2
+        add_member32    v18, x17, x24, x20, x13, +, -, -, +
+        add_member32    v18, x24, x22, x17, x12, -, -, +, -, 2
+        add_member32    v19, x20, x17, x14, x11, -, +, -, +
+        add_member32    v19, x13, x12, x11, x10, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+        ld    x\i, 8*(\i - 10)(sp)
+.endr
+        addi    sp, sp, 8*16
+
+        ret
+endfunc
+
+.macro butterfly32 in0, in1, in2, in3, out
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vadd.vv             \out, \in0, \in1
+        vsub.vv             \in0, \in0, \in1
+        vadd.vv             \in1, \in2, \in3
+        vsub.vv             \in2, \in2, \in3
+.endm
+
+.macro load16 in0, in1, in2, in3
+        sub t0, a3, a1
+
+        vsetivli    zero, 2, e64, m1, ta, ma
+        vlse64.v \in0, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vlse64.v \in1, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vlse64.v \in2, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vlse64.v \in3, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+.endm
+
+.macro store16 in0, in1, in2, in3, rx
+        sub t0, a3, a1
+        vsetivli    zero, 2, e64, m1, ta, ma
+        vsse64.v \in0, (a1), t0
+        add a1, a1, a2
+        add a3, a3, \rx
+
+        sub t0, a3, a1
+        vsse64.v \in1, (a1), t0
+        add a1, a1, a2
+        add a3, a3, \rx
+
+        sub t0, a3, a1
+        vsse64.v \in2, (a1), t0
+        add a1, a1, a2
+        add a3, a3, \rx
+
+        sub t0, a3, a1
+        vsse64.v \in3, (a1), t0
+        add a1, a1, a2
+        add a3, a3, \rx
+.endm
+
+.macro load32
+        addi             a1,  a5, 64
+        addi             a3,  a1, 128
+        li               a2,  256
+
+        sub t0, a3, a1
+        vsetivli    zero, 2, e64, m1, ta, ma
+        vlse64.v v4, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vlse64.v v5, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vlse64.v v6, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vlse64.v v7, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vlse64.v v16, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vlse64.v v17, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vlse64.v v18, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vlse64.v v19, (a1), t0
+        add a1, a1, a2
+        add a3, a3, a2
+
+.endm
+
+.macro add_member in, tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7, op0, op1, op2, op3, op4, op5, op6, op7, p
+        .ifc \op0, -
+                neg    \tt0, \tt0
+        .endif
+        .ifc \op1, -
+                neg    \tt1, \tt1
+        .endif
+        .ifc \op2, -
+                neg    \tt2, \tt2
+        .endif
+        .ifc \op3, -
+                neg    \tt3, \tt3
+        .endif
+        .ifc \op4, -
+                neg    \tt4, \tt4
+        .endif
+        .ifc \op5, -
+                neg    \tt5, \tt5
+        .endif
+        .ifc \op6, -
+                neg    \tt6, \tt6
+        .endif
+        .ifc \op7, -
+                neg    \tt7, \tt7
+        .endif
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        .ifc \p, 2
+                vslidedown.vi    v8, \in, 4
+        .else
+                vmv.v.v         v8, \in
+        .endif
+
+        vwmacc.vx    v21, \tt0, v8
+        vwmacc.vx    v22, \tt1, v8
+        vwmacc.vx    v23, \tt2, v8
+        vwmacc.vx    v24, \tt3, v8
+        vwmacc.vx    v25, \tt4, v8
+        vwmacc.vx    v26, \tt5, v8
+        vwmacc.vx    v27, \tt6, v8
+        vwmacc.vx    v28, \tt7, v8
+
+        .ifc \op0, -
+                neg    \tt0, \tt0
+        .endif
+        .ifc \op1, -
+                neg    \tt1, \tt1
+        .endif
+        .ifc \op2, -
+                neg    \tt2, \tt2
+        .endif
+        .ifc \op3, -
+                neg    \tt3, \tt3
+        .endif
+        .ifc \op4, -
+                neg    \tt4, \tt4
+        .endif
+        .ifc \op5, -
+                neg    \tt5, \tt5
+        .endif
+        .ifc \op6, -
+                neg    \tt6, \tt6
+        .endif
+        .ifc \op7, -
+                neg    \tt7, \tt7
+        .endif
+.endm
+
+.macro scale_store shift
+        vsetivli    zero, 8, e16, m1, ta, ma
+        vle16.v        v28, (a4)
+        addi            a4, a4, 2*8
+        vle16.v        v29, (a4)
+        addi            a4, a4, 2*8
+        vle16.v        v30, (a4)
+        addi            a4, a4, 2*8
+        vle16.v        v31, (a4)
+        addi            a4, a4, 2*8
+
+        butterfly32     v28, v24, v29, v25, v2
+        butterfly32     v30, v26, v31, v27, v3
+
+        scale           v20, v21, v22, v23, v2, v28, v24, v29, v3, v30, v26, v31, \shift
+
+        transpose16_4x4_2 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+
+        store16         v20, v21, v22, v23, t1
+
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vle16.v         v2, (t2)
+        addi            t2, t2, 8
+        vle16.v         v3, (t2)
+        addi            t2, t2, -8
+.endm
+
+.macro store_to_stack off1, off2, in0, in2, in4, in6, in7, in5, in3, in1
+        li      a7, \off1
+        add     a1, sp, a7
+        li      a7, \off2
+        add     a3, sp, a7
+        li      a2, -16
+        li      a4, 16
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vse32.v         \in0, (a1)
+        add             a1, a1, a4
+        vse32.v         \in1, (a3)
+        add             a3, a3, a2
+        vse32.v         \in2, (a1)
+        add             a1, a1, a4
+        vse32.v         \in3, (a3)
+        add             a3, a3, a2
+        vse32.v         \in4, (a1)
+        add             a1, a1, a4
+        vse32.v         \in5, (a3)
+        add             a3, a3, a2
+        vse32.v         \in6, (a1)
+        vse32.v         \in7, (a3)
+.endm
+
+.macro transpose16_4x4_2 r0, r1, r2, r3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
+        vsetivli    zero, 8, e16, m1, ta, ma
+        vse16.v    v\r0, (s1)
+        add t0, s1, 16
+        vse16.v    v\r1, (t0)
+        add t0, t0, 16
+        vse16.v    v\r2, (t0)
+        add t0, t0, 16
+        vse16.v    v\r3, (t0)
+
+        li t0, 32
+        vsetvli    zero, t0, e16, m4, ta, ma
+        lla        t0, trans_index
+        vle16.v    v\tmp0, (t0)
+        vluxei16.v v\tmp0, (s1), v\tmp0
+        vse16.v    v\tmp0, (s1)
+
+        vsetivli    zero, 8, e16, m1, ta, ma
+        vle16.v    v\r0, (s1)
+        add t0, s1, 16
+        vle16.v    v\r1, (t0)
+        add t0, t0, 16
+        vle16.v    v\r2, (t0)
+        add t0, t0, 16
+        vle16.v    v\r3, (t0)
+.endm
+
+.macro tr16_8x4 in0, in1, in2, in3, offset
+        tr_4x4_8        \in0, \in1, \in2, \in3, v24, v25, v26, v27
+
+        vsetivli    zero, 8, e16, m1, ta, ma
+        vse16.v     v0, (s0)
+        lh      s6, 4*2(s0)
+        lh      s7, 5*2(s0)
+        lh      s8, 6*2(s0)
+        lh      s9, 7*2(s0)
+
+        neg    s2, s6
+        neg    s4, s8
+        neg    s5, s9
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        vslidedown.vi    v8, \in0, 4
+        vwmul.vx    v28, v8, s6
+        vwmul.vx    v29, v8, s7
+        vwmul.vx    v30, v8, s8
+        vwmul.vx    v31, v8, s9
+
+        vslidedown.vi    v8, \in1, 4
+        vwmacc.vx    v28, s7, v8
+        vwmacc.vx    v29, s5, v8
+        vwmacc.vx    v30, s2, v8
+        vwmacc.vx    v31, s4, v8
+
+
+        vslidedown.vi    v8, \in2, 4
+        vwmacc.vx    v28, s8, v8
+        vwmacc.vx    v29, s2, v8
+        vwmacc.vx    v30, s9, v8
+        vwmacc.vx    v31, s7, v8
+
+        vslidedown.vi    v8, \in3, 4
+        vwmacc.vx    v28, s9, v8
+        vwmacc.vx    v29, s4, v8
+        vwmacc.vx    v30, s7, v8
+        vwmacc.vx    v31, s2, v8
+
+
+        butterfly       v24, v28, v16, v23
+        butterfly       v25, v29, v17, v22
+        butterfly       v26, v30, v18, v21
+        butterfly       v27, v31, v19, v20
+
+        li a7, \offset
+        add a4, sp, a7
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vse32.v v16, (a4)
+        add a4, a4, 16
+        vse32.v v17, (a4)
+        add a4, a4, 16
+        vse32.v v18, (a4)
+        add a4, a4, 16
+        vse32.v v19, (a4)
+        add a4, a4, 16
+
+        vse32.v v20, (a4)
+        add a4, a4, 16
+        vse32.v v21, (a4)
+        add a4, a4, 16
+        vse32.v v22, (a4)
+        add a4, a4, 16
+        vse32.v v23, (a4)
+        add a4, a4, 16
+
+        add a4, a4, -64
+.endm
+
+.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
+        vsetivli        zero, 4, e16, mf2, ta, ma
+        vnclip.wi       v8, \in1\(), \shift
+        vsetivli        zero, 2, e64, m1, ta, ma
+        vslideup.vi     \out0\(), v8, 1
+        vsetivli        zero, 4, e16, mf2, ta, ma
+        vnclip.wi       \out0\(), \in0\(), \shift
+
+        vsetivli        zero, 4, e16, mf2, ta, ma
+        vnclip.wi       v8, \in3\(), \shift
+        vsetivli        zero, 2, e64, m1, ta, ma
+        vslideup.vi     \out1\(), v8, 1
+        vsetivli        zero, 4, e16, mf2, ta, ma
+        vnclip.wi       \out1\(), \in2\(), \shift
+
+        vsetivli        zero, 4, e16, mf2, ta, ma
+        vnclip.wi       v8, \in5\(), \shift
+        vsetivli        zero, 2, e64, m1, ta, ma
+        vslideup.vi     \out2\(), v8, 1
+        vsetivli        zero, 4, e16, mf2, ta, ma
+        vnclip.wi       \out2\(), \in4\(), \shift
+
+        vsetivli        zero, 4, e16, mf2, ta, ma
+        vnclip.wi       v8, \in7\(), \shift
+        vsetivli        zero, 2, e64, m1, ta, ma
+        vslideup.vi     \out3\(), v8, 1
+        vsetivli        zero, 4, e16, mf2, ta, ma
+        vnclip.wi       \out3\(), \in6\(), \shift
+.endm
+
+.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vwcvt.x.x.v     v8, \in0
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vsll.vi         v28, v8, 6
+
+        vsetivli    zero, 16, e8, m1, ta, ma
+        vmv.v.v v29, v28
+
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vse16.v     v0, (s0)
+        lh      s2, 0*2(s0)
+        lh      s3, 1*2(s0)
+        lh      s5, 3*2(s0)
+
+        neg    s6, s2
+        neg    s7, s3
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        vwmul.vx    v30, \in1, s3
+        vwmul.vx    v31, \in1, s5
+        vwmacc.vx    v28, s2, \in2
+
+        vwmacc.vx    v29, s6, \in2
+        vwmacc.vx    v30, s5, \in3
+        vwmacc.vx    v31, s7, \in3
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vadd.vv             \out0, v28, v30
+        vadd.vv             \out1, v29, v31
+        vsub.vv             \out2, v29, v31
+        vsub.vv             \out3, v28, v30
+.endm
+
+.macro tr_16x4 name, shift, offset, step
+func func_tr_16x4_\name, zve64x
+        mv             a1,  a5
+        addi           a3,  a5, \step * 64
+        li             a2,  \step * 128
+        load16          v16, v17, v18, v19
+
+        lla              a1, trans
+
+        vsetivli    zero, 8, e16, m1, ta, ma
+        vle16.v         v0, (a1)
+
+        tr16_8x4        v16, v17, v18, v19, \offset
+
+        addi             a1,  a5, \step * 32
+        addi             a3,  a5, \step * 3 *32
+        li             a2,  \step * 128
+        load16          v20, v17, v18, v19
+
+        lla              a1, trans
+        addi             a1, a1, 16
+
+        vsetivli    zero, 8, e16, m1, ta, ma
+        vle16.v          v1, (a1)
+
+        lh      s2, 0*2(a1)
+        lh      s3, 1*2(a1)
+        lh      s4, 2*2(a1)
+        lh      s5, 3*2(a1)
+        lh      s6, 4*2(a1)
+        lh      s7, 5*2(a1)
+        lh      s8, 6*2(a1)
+        lh      s9, 7*2(a1)
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        vwmul.vx            v21, v20, s2
+        vwmul.vx            v22, v20, s3
+        vwmul.vx            v23, v20, s4
+        vwmul.vx            v24, v20, s5
+        vwmul.vx            v25, v20, s6
+        vwmul.vx            v26, v20, s7
+        vwmul.vx            v27, v20, s8
+        vwmul.vx            v28, v20, s9
+
+        add_member      v20, s3, s6, s9, s7, s4, s2, s5, s8, +, +, +, -, -, -, -, -, 2
+        add_member      v17, s4, s9, s5, s3, s8, s6, s2, s7, +, +, -, -, -, +, +, +
+        add_member      v17, s5, s7, s3, s9, s2, s8, s4, s6, +, -, -, +, +, +, -, -, 2
+        add_member      v18, s6, s4, s8, s2, s9, s3, s7, s5, +, -, -, +, -, -, +, +
+        add_member      v18, s7, s2, s6, s8, s3, s5, s9, s4, +, -, +, +, -, +, +, -, 2
+        add_member      v19, s8, s5, s2, s4, s7, s9, s6, s3, +, -, +, -, +, +, -, +
+        add_member      v19, s9, s8, s7, s6, s5, s4, s3, s2, +, -, +, -, +, -, +, -, 2
+
+        li a7, \offset
+        add a4, sp, a7
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vle32.v        v16, (a4)
+        addi            a4, a4, 16
+        vle32.v        v17, (a4)
+        addi            a4, a4, 16
+        vle32.v        v18, (a4)
+        addi            a4, a4, 16
+        vle32.v        v19, (a4)
+        addi            a4, a4, 16
+
+        butterfly16       v16, v21, v17, v22, v18, v23, v19, v24
+    .if \shift > 0
+        scale             v29, v30, v31, v24, v20, v16, v21, v17, v22, v18, v23, v19, \shift
+
+        transpose16_4x4_2 29, 30, 31, 24, 4, 5, 6, 7, 2, 3
+
+        mv             a1,  a6
+        addi             a3,  a6, 24 +3*32
+        li             a2, 32
+        li             a4, -32
+
+        store16         v29, v30, v31, v24, a4
+    .else
+        store_to_stack  \offset, (\offset + 240), v20, v21, v22, v23, v19, v18, v17, v16
+    .endif
+
+        li a7, \offset+64
+        add a4, sp, a7
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vle32.v        v16, (a4)
+        addi            a4, a4, 16
+        vle32.v        v17, (a4)
+        addi            a4, a4, 16
+        vle32.v        v18, (a4)
+        addi            a4, a4, 16
+        vle32.v        v19, (a4)
+        addi            a4, a4, 16
+
+        butterfly16     v16, v25, v17, v26, v18, v27, v19, v28
+   .if \shift > 0
+        scale             v29, v30, v31, v20, v20, v16, v25, v17, v26, v18, v27, v19, \shift
+
+        transpose16_4x4_2 29, 30, 31, 20, 4, 5, 6, 7, 2, 3
+
+        add            a1,  a6, 8
+        add            a3,  a6, (16 + 3 * 32)
+        li             a2, 32
+        li             a4, -32
+        store16        v29, v30, v31, v20, a4
+   .else
+        store_to_stack  (\offset + 64), (\offset + 176), v20, v25, v26, v27, v19, v18, v17, v16
+   .endif
+        ret
+endfunc
+.endm
+
+tr_16x4 noscale, 0, 2048, 4
+
+.macro tr_32x4 name, shift
+func func_tr_32x4_\name, zve64x
+        mv             t3, ra
+        jal              func_tr_16x4_noscale
+
+        load32
+
+        lla              t2, trans
+        addi             t2, t2, 32
+
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vle16.v v0, (t2)
+        addi t2, t2, 2*4
+        vle16.v v1, (t2)
+        addi t2, t2, 2*4
+        vle16.v v2, (t2)
+        addi t2, t2, 2*4
+        vle16.v v3, (t2)
+        addi t2, t2, -2*4
+
+        li a7, 2048
+        add a4, sp, a7
+
+        li               a2, 64
+        li               t1, -64
+
+        jal               tr_block1
+        mv                a1, t4
+        addi              a3, t4, (56 + 3 * 64)
+        scale_store       \shift
+
+        jal              tr_block2
+        addi             a1, t4, 8
+        addi             a3, t4, (48 + 3 * 64)
+        scale_store      \shift
+
+        jal              tr_block3
+        addi             a1, t4, 16
+        addi             a3, t4, (40 + 3 * 64)
+        scale_store      \shift
+
+        jal              tr_block4
+        addi             a1, t4, 24
+        addi             a3, t4, (32 + 3 * 64)
+        scale_store      \shift
+
+        jr t3
+endfunc
+.endm
+
+tr_32x4 firstpass, 7
+tr_32x4 secondpass_8, 20 - 8
+
+.macro idct_32x32 bitdepth
+func ff_hevc_idct_32x32_\bitdepth\()_rvv, zve64x
+        mv t6, ra
+        addi    sp, sp, -8*13
+        sd    ra, 8*12(sp)
+        sd    s0, 8*11(sp)
+        sd    s1, 8*10(sp)
+        sd    s2, 8*9(sp)
+        sd    s3, 8*8(sp)
+        sd    s4, 8*7(sp)
+        sd    s5, 8*6(sp)
+        sd    s6, 8*5(sp)
+        sd    s7, 8*4(sp)
+        sd    s8, 8*3(sp)
+        sd    s9, 8*2(sp)
+        sd    s10, 8*1(sp)
+        sd    s11, 8*0(sp)
+
+        add sp, sp, -16
+        mv s0, sp
+
+        add sp, sp, -64
+        mv s1, sp
+
+        csrwi   vxrm, 1
+        li  a7, 2432
+        sub sp, sp, a7
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+        li               a7, 8 * \i
+        add              a5, a0, a7
+
+        li               a7, 8 * \i * 32
+        add              t4, sp, a7
+        jal              func_tr_32x4_firstpass
+.endr
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+        addi             a5, sp, 8 * \i
+        addi             t4, a0, 8 * \i * 32
+        jal              func_tr_32x4_secondpass_\bitdepth
+.endr
+
+        li              a7, 2432
+        add             sp,  sp,  a7
+
+        add  sp, sp, 80
+
+        ld    ra, 8*12(sp)
+        ld    s0, 8*11(sp)
+        ld    s1, 8*10(sp)
+        ld    s2, 8*9(sp)
+        ld    s3, 8*8(sp)
+        ld    s4, 8*7(sp)
+        ld    s5, 8*6(sp)
+        ld    s6, 8*5(sp)
+        ld    s7, 8*4(sp)
+        ld    s8, 8*3(sp)
+        ld    s9, 8*2(sp)
+        ld    s10, 8*1(sp)
+        ld    s11, 8*0(sp)
+        addi    sp, sp, 8*13
+
+        jr t6
+endfunc
+.endm
+
+idct_32x32 8
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 1d8326a573..6dfb889eec 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -27,6 +27,8 @@
 #include "libavcodec/hevc/dsp.h"
 #include "libavcodec/riscv/h26x/h2656dsp.h"
 
+void ff_hevc_idct_32x32_8_rvv(int16_t *coeffs, int col_limit);
+
 #define RVV_FNASSIGN(member, v, h, fn, ext) \
         member[1][v][h] = ff_h2656_put_pixels_##8_##ext;  \
         member[3][v][h] = ff_h2656_put_pixels_##8_##ext;  \
@@ -40,27 +42,37 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
     const int flags = av_get_cpu_flags();
     int vlenb;
 
-    if (!(flags & AV_CPU_FLAG_RVV_I32) || !(flags & AV_CPU_FLAG_RVB))
-        return;
-
     vlenb = ff_get_rv_vlenb();
-    if (vlenb >= 32) {
-        switch (bit_depth) {
-            case 8:
-                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
-                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
-                break;
-            default:
-                break;
-        }
-    } else if (vlenb >= 16) {
-        switch (bit_depth) {
-            case 8:
-                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
-                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
-                break;
-            default:
-                break;
+
+    if (flags & AV_CPU_FLAG_RVV_I64)
+        if (vlenb >= 16)
+            switch (bit_depth) {
+                case 8:
+                    c->idct[3]                     = ff_hevc_idct_32x32_8_rvv;
+                    break;
+                default:
+                    break;
+            }
+
+    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)){
+        if (vlenb >= 32) {
+            switch (bit_depth) {
+                case 8:
+                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
+                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
+                    break;
+                default:
+                    break;
+            }
+        } else if (vlenb >= 16) {
+            switch (bit_depth) {
+                case 8:
+                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
+                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
+                    break;
+                default:
+                    break;
+            }
         }
     }
 #endif
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".


* Re: [FFmpeg-devel] [PATCH v4] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC
  2025-05-20  7:58 [FFmpeg-devel] [PATCH v4] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC daichengrong
@ 2025-05-24 16:18 ` Rémi Denis-Courmont
  2025-05-30  7:40 ` [FFmpeg-devel] [PATCH v5] " daichengrong
  1 sibling, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2025-05-24 16:18 UTC (permalink / raw)
  To: ffmpeg-devel

On Tuesday, 20 May 2025 at 10:58:06 Eastern European Summer Time,
daichengrong@iscas.ac.cn wrote:
> From: daichengrong <daichengrong@iscas.ac.cn>
> 
> Since there were no comments on v2 and v3, we have continued to optimize
> according to the comments on v1. We spilled values from vector registers to
> memory instead of using slides to help improve performance, and optimized
> the extraction of elements from vector registers.

You still seem to be flip-flopping values in X registers. You may need to go 
easier on macros to get a better view of the actual generated code.

Also it seems that this uses half-vectors a lot. I am not sure if this can be 
avoided, but typically that leads to very poor performance.
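
(Presumably constructs like the following, from the sum_sub macro in the patch with the output register instantiated for illustration, which pin VL to four e16 elements at a fractional LMUL:)

        vsetivli    zero, 4, e16, mf2, ta, ma
        vwmacc.vx   v24, t0, v8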

Also you're resetting `vl` with its current value, which can hurt performance 
depending on the implementation. If you don't need to change `vl`, then use 
`zero`.

Lastly, you seem to be changing vtype when it's not actually needed, e.g.:

vsetivli zero, 4, e16, mf2...
...
vsetivli zero, 4, e32, m1...
vse32.v ...
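
A minimal sketch of the vl-preserving alternative (illustrative registers, not from the patch; the x0/x0 form is only legal while the SEW/LMUL ratio, and hence VLMAX, stays the same):

        vsetivli        zero, 4, e16, mf2, ta, ma     # set vl = 4 once
        vwmacc.vx       v24, t0, v4                   # e16 inputs, e32 accumulators
        vsetvli         zero, zero, e32, m1, ta, ma   # change vtype only, vl is kept
        vse32.v         v24, (a4)                     # store the widened results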




-- 
德尼-库尔蒙‧雷米
Hagalund ny stad, f.d. Finska republik Nylands



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".


* [FFmpeg-devel] [PATCH v5] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC
  2025-05-20  7:58 [FFmpeg-devel] [PATCH v4] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC daichengrong
  2025-05-24 16:18 ` Rémi Denis-Courmont
@ 2025-05-30  7:40 ` daichengrong
  2025-06-02 20:13   ` Rémi Denis-Courmont
  2025-06-06  9:26   ` [FFmpeg-devel] [PATCH v6] libavcodec/riscv: add " daichengrong
  1 sibling, 2 replies; 5+ messages in thread
From: daichengrong @ 2025-05-30  7:40 UTC (permalink / raw)
  To: ffmpeg-devel

From: daichengrong <daichengrong@iscas.ac.cn>

On Banana PI F3:
        hevc_idct_32x32_8_c:                                118807.4 ( 1.00x)
        hevc_idct_32x32_8_rvv_i64:                           13853.3 ( 8.58x)
        hevc_idct_32x32_8_rvv_i64:                           20247.3 ( 5.92x) (before)

Changes in v5:
        Improve the continuity of vector operations
        Load the transform matrices with immediate instructions instead of from memory (see the sketch after these change lists)

Changes in v4:
        Optimize away unnecessary slide operations
        Extract more scalars from vector registers into general-purpose registers

Changes in v3:
        Remove the slides in the transposition and spill values from vector registers to the stack

Changes in v2:
        Delete tabs
        Remove the unnecessary t0 in vsetivli
        Extract scalars directly into general-purpose registers
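
A rough sketch of the matrix-loading change mentioned above (illustrative only; the actual load_trans_* macros are in the diff below):

        # before (v4): fetch the first DCT factors from the rodata table
        lla     t0, trans
        lh      s2, 0(t0)               # 64
        lh      s3, 2(t0)               # 83
        # after (v5): materialise them as immediates
        li      s2, 64
        li      s3, 83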

---
 libavcodec/riscv/Makefile           |   1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 719 ++++++++++++++++++++++++++++
 libavcodec/riscv/hevcdsp_init.c     |  52 +-
 3 files changed, 752 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 0000000000..4628415631
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,719 @@
+/*
+ * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3
+        .ifc \op0, -
+                neg    t0, \t0
+        .endif
+        .ifc \op1, -
+                neg    t1, \t1
+        .endif
+        .ifc \op2, -
+                neg    t4, \t2
+        .endif
+        .ifc \op3, -
+                neg    t5, \t3
+        .endif
+
+        .ifc \op0, -
+                vwmacc.vx    v24, t0, \in
+        .else
+                vwmacc.vx    v24, \t0, \in
+        .endif
+        .ifc \op1, -
+                vwmacc.vx    v25, t1, \in
+        .else
+                vwmacc.vx    v25, \t1, \in
+        .endif
+        .ifc \op2, -
+                vwmacc.vx    v26, t4, \in
+        .else
+                vwmacc.vx    v26, \t2, \in
+        .endif
+        .ifc \op3, -
+                vwmacc.vx    v27, t5, \in
+        .else
+                vwmacc.vx    v27, \t3, \in
+        .endif
+.endm
+
+.macro tr_block_init
+        vslidedown.vi    v12, v4, 4
+        vslidedown.vi    v13, v5, 4
+        vslidedown.vi    v14, v6, 4
+        vslidedown.vi    v15, v7, 4
+
+        vslidedown.vi    v20, v16, 4
+        vslidedown.vi    v21, v17, 4
+        vslidedown.vi    v22, v18, 4
+        vslidedown.vi    v23, v19, 4
+.endm
+
+.macro tr_block1
+        tr_block_init
+
+        vwmul.vx        v24, v4, x12
+        vwmul.vx        v25, v4, x13
+        vwmul.vx        v26, v4, x14
+        vwmul.vx        v27, v4, x15
+
+        add_member32    v12, x13, x16, x19, x22,  +,  +,  +,  +
+        add_member32    v5, x14, x19, x24, x26,  +,  +,  +,  -
+        add_member32    v13, x15, x22, x26, x19,  +,  +,  -,  -
+        add_member32    v6, x16, x25, x21, x12,  +,  +,  -,  -
+        add_member32    v14, x17, x27, x16, x18,  +,  -,  -,  -
+        add_member32    v7, x18, x24, x12, x25,  +,  -,  -,  -
+        add_member32    v15, x19, x21, x17, x23,  +,  -,  -,  +
+
+
+        add_member32    v16, x20, x18, x22, x16,  +,  -,  -,  +
+        add_member32    v20, x21, x15, x27, x14,  +,  -,  -,  +
+        add_member32    v17, x22, x13, x23, x21,  +,  -,  +,  +
+        add_member32    v21, x23, x14, x18, x27,  +,  -,  +,  -
+        add_member32    v18, x24, x17, x13, x20,  +,  -,  +,  -
+        add_member32    v22, x25, x20, x15, x13,  +,  -,  +,  -
+        add_member32    v19, x26, x23, x20, x17,  +,  -,  +,  -
+        add_member32    v23, x27, x26, x25, x24,  +,  -,  +,  -
+.endm
+
+.macro tr_block2
+        tr_block_init
+
+        vwmul.vx        v24, v4, x16
+        vwmul.vx        v25, v4, x17
+        vwmul.vx        v26, v4, x18
+        vwmul.vx        v27, v4, x19
+
+        add_member32    v12, x25, x27, x24, x21,  +,  -,  -,  -
+        add_member32    v5, x21, x16, x12, x17,  -,  -,  -,  -
+        add_member32    v13, x12, x18, x25, x23,  -,  -,  -,  +
+        add_member32    v6, x20, x26, x17, x15,  -,  +,  +,  +
+        add_member32    v14, x26, x15, x19, x25,  +,  +,  +,  -
+        add_member32    v7, x17, x19, x23, x12,  +,  +,  -,  -
+        add_member32    v15, x15, x25, x13, x27,  +,  -,  -,  +
+
+        add_member32    v16, x24, x14, x26, x13,  +,  -,  -,  +
+        add_member32    v20, x22, x20, x16, x26,  -,  -,  +,  +
+        add_member32    v17, x13, x24, x20, x14,  -,  +,  +,  -
+        add_member32    v21, x19, x13, x22, x24,  -,  +,  -,  -
+        add_member32    v18, x27, x21, x14, x16,  +,  +,  -,  +
+        add_member32    v22, x18, x23, x27, x22,  +,  -,  -,  +
+        add_member32    v19, x14, x13, x15, x18,  +,  -,  +,  -
+        add_member32    v23, x23, x22, x21, x20,  +,  -,  +,  -
+.endm
+
+.macro tr_block3
+        tr_block_init
+
+        vwmul.vx        v24, v4, x20
+        vwmul.vx        v25, v4, x21
+        vwmul.vx        v26, v4, x22
+        vwmul.vx        v27, v4, x23
+
+        add_member32    v12, x18, x15, x12, x14,  -,  -,  -,  -
+        add_member32    v5, x22, x27, x23, x18,  -,  -,  +,  +
+        add_member32    v13, x16, x14, x21, x27,  +,  +,  +,  -
+        add_member32    v6, x24, x22, x13, x19,  +,  -,  -,  -
+        add_member32    v14, x14, x20, x24, x12,  -,  -,  +,  +
+        add_member32    v7, x26, x16, x20, x22,  -,  +,  +,  -
+        add_member32    v15, x12, x26, x14, x24,  +,  +,  -,  -
+        add_member32    v16, x27, x13, x25, x15,  -,  -,  +,  +
+        add_member32    v20, x13, x23, x19, x17,  -,  +,  +,  -
+        add_member32    v17, x25, x19, x15, x26,  +,  +,  -,  +
+        add_member32    v21, x15, x17, x26, x20,  +,  -,  +,  +
+        add_member32    v18, x23, x25, x18, x13,  -,  -,  +,  -
+        add_member32    v22, x17, x12, x16, x21,  -,  +,  -,  +
+        add_member32    v19, x21, x24, x27, x25,  +,  -,  +,  +
+        add_member32    v23, x19, x18, x17, x16,  +,  -,  +,  -
+.endm
+
+.macro tr_block4
+        tr_block_init
+
+        vwmul.vx        v24, v4, x24
+        vwmul.vx        v25, v4, x25
+        vwmul.vx        v26, v4, x26
+        vwmul.vx        v27, v4, x27
+
+        add_member32    v12, x17, x20, x23, x26,  -,  -,  -,  -
+        add_member32    v5, x12, x15, x20, x25,  +,  +,  +,  +
+        add_member32    v13, x20, x12, x17, x24,  -,  -,  -,  -
+        add_member32    v6, x27, x18, x14, x23,  +,  +,  +,  +
+        add_member32    v14, x21, x23, x12, x22,  +,  -,  -,  -
+        add_member32    v7, x14, x27, x15, x21,  -,  -,  +,  +
+        add_member32    v15, x16, x22, x18, x20,  +,  +,  -,  -
+        add_member32    v16, x23, x17, x21, x19,  -,  -,  +,  +
+        add_member32    v20, x25, x13, x24, x18,  -,  +,  -,  -
+        add_member32    v17, x18, x16, x27, x17,  +,  -,  +,  +
+        add_member32    v21, x13, x21, x25, x16,  -,  +,  +,  -
+        add_member32    v18, x19, x26, x22, x15,  +,  -,  -,  +
+        add_member32    v22, x26, x24, x19, x14,  -,  -,  +,  -
+        add_member32    v19, x22, x19, x16, x13,  -,  +,  -,  +
+        add_member32    v23, x15, x14, x13, x12,  +,  -,  +,  -
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m
+        vadd.vv             \tmp_p, \e, \o
+        vsub.vv             \tmp_m, \e, \o
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+        vadd.vv              v20, \in0, \in1
+        vsub.vv              \in0, \in0, \in1
+        vadd.vv              \in1, \in2, \in3
+        vsub.vv              \in2, \in2, \in3
+        vadd.vv              \in3, \in4, \in5
+        vsub.vv              \in4, \in4, \in5
+        vadd.vv              \in5, \in6, \in7
+        vsub.vv              \in6, \in6, \in7
+.endm
+
+.macro butterfly32 in0, in1, in2, in3, out
+        vadd.vv             \out, \in0, \in1
+        vsub.vv             \in0, \in0, \in1
+        vadd.vv             \in1, \in2, \in3
+        vsub.vv             \in2, \in2, \in3
+.endm
+
+.macro add_member in, tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7
+        vwmacc.vx    v21, \tt0, \in
+        vwmacc.vx    v22, \tt1, \in
+        vwmacc.vx    v23, \tt2, \in
+        vwmacc.vx    v24, \tt3, \in
+        vwmacc.vx    v25, \tt4, \in
+        vwmacc.vx    v26, \tt5, \in
+        vwmacc.vx    v27, \tt6, \in
+        vwmacc.vx    v28, \tt7, \in
+.endm
+
+.macro load16_rvv in0, in1, in2, in3, lo, hi, step
+        sub t0, \hi, \lo
+
+        vlse64.v \in0, (\lo), t0
+        add \lo, \lo, \step
+
+        vlse64.v \in1, (\lo), t0
+        add \lo, \lo, \step
+
+        vlse64.v \in2, (\lo), t0
+        add \lo, \lo, \step
+
+        vlse64.v \in3, (\lo), t0
+.endm
+
+.macro store16_rvv in0, in1, in2, in3, lo, hi, step
+        sub t0, \hi, \lo
+        li  t1, 2 * \step
+        vsse64.v \in0, (\lo), t0
+
+        addi \lo, \lo, \step
+        sub t0, t0, t1
+        vsse64.v \in1, (\lo), t0
+
+        addi \lo, \lo, \step
+        sub t0, t0, t1
+        vsse64.v \in2, (\lo), t0
+
+        addi \lo, \lo, \step
+        sub t0, t0, t1
+        vsse64.v \in3, (\lo), t0
+.endm
+
+.macro load32_rvv src
+        addi             a2,  \src, 64
+        li               a3,  256
+        li               t0, 128
+
+        vlse64.v v4, (a2), t0
+
+        add s2, a2, a3
+        vlse64.v v5, (s2), t0
+
+        add s3, s2, a3
+        vlse64.v v6, (s3), t0
+
+        add s4, s3, a3
+        vlse64.v v7, (s4), t0
+
+        add s5, s4, a3
+        vlse64.v v16, (s5), t0
+
+        add s6, s5, a3
+        vlse64.v v17, (s6), t0
+
+        add s7, s6, a3
+        vlse64.v v18, (s7), t0
+
+        add s8, s7, a3
+        vlse64.v v19, (s8), t0
+.endm
+
+.macro scale_store_rvv shift, dstL, dstH, step
+        vsetivli    zero, 8, e16, m1, ta, ma
+        vle16.v        v28, (t2)
+        addi            t2, t2, 2*8
+        vle16.v        v29, (t2)
+        addi            t2, t2, 2*8
+        vle16.v        v30, (t2)
+        addi            t2, t2, 2*8
+        vle16.v        v31, (t2)
+        addi            t2, t2, 2*8
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        butterfly32     v28, v24, v29, v25, v2
+        butterfly32     v30, v26, v31, v27, v3
+
+        scale           v20, v21, v22, v23, v2, v28, v24, v29, v3, v30, v26, v31, \shift
+
+        transpose16_4x4_2 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+
+        vsetivli    zero, 2, e64, m1, ta, ma
+        store16_rvv         v20, v21, v22, v23, \dstL, \dstH, \step
+.endm
+
+.macro store_to_stack_rvv off1, off2, in0, in2, in4, in6, in7, in5, in3, in1
+        li      t0, \off1
+        add     a2, sp, t0
+
+        li      t0, \off2
+        add     a3, sp, t0
+
+        vse32.v         \in0, (a2)
+        vse32.v         \in1, (a3)
+        addi             a2, a2, 16
+        addi             a3, a3, -16
+        vse32.v         \in2, (a2)
+        vse32.v         \in3, (a3)
+        addi             a2, a2, 16
+        addi             a3, a3, -16
+        vse32.v         \in4, (a2)
+        vse32.v         \in5, (a3)
+        addi             a2, a2, 16
+        addi             a3, a3, -16
+        vse32.v         \in6, (a2)
+        vse32.v         \in7, (a3)
+.endm
+
+.macro transpose16_4x4_2 r0, r1, r2, r3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
+        vsetivli    zero, 8, e16, m1, ta, ma
+        vid.v     v0
+        vand.vi   v8, v0, 1
+        vmsne.vi  v0, v8, 0
+
+        vslideup.vi    v8, v\r1, 1
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vmerge.vvm       v\tmp0\(), v\r0\(), v8, v0
+
+        vslidedown.vi    v8, v\r0, 1
+        vmerge.vvm       v\tmp1\(), v8, v\r1\(), v0
+
+        vslideup.vi    v8, v\r3, 1
+        vmerge.vvm       v\tmp2\(), v\r2\(), v8, v0
+
+        vslidedown.vi    v8, v\r2\(), 1
+        vmerge.vvm       v\tmp3\(), v8, v\r3\(), v0
+
+        vsetivli    zero, 2, e32, m1, ta, ma
+
+        vslideup.vi    v8, v\tmp2\(), 1
+        vmerge.vvm       v\tmp4\(), v\tmp0\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp0\(), 1
+        vmerge.vvm       v\tmp5\(), v8, v\tmp2\(), v0
+
+        vslideup.vi    v8, v\tmp3\(), 1
+        vmerge.vvm       v\tmp0\(), v\tmp1\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp1\(), 1
+        vmerge.vvm       v\tmp2\(), v8, v\tmp3\(), v0
+
+        vsetivli    zero, 1, e64, m1, ta, ma
+        vmv.v.v  v\r0\(), v\tmp4\()
+        vmv.v.v  v\r2\(), v\tmp5\()
+        vmv.v.v  v\r1\(), v\tmp0\()
+        vmv.v.v  v\r3\(), v\tmp2\()
+
+        vsetivli    zero, 8, e16, m1, ta, ma
+
+        vslideup.vi    v8, v\r2\(), 1
+        vmerge.vvm       v\tmp0\(), v\r3\(), v8, v0
+
+        vslidedown.vi    v8, v\r3\(), 1
+        vmerge.vvm        v\tmp1\(), v8, v\r2\(), v0
+
+        vslideup.vi    v8, v\r0\(), 1
+        vmerge.vvm       v\tmp2\(), v\r1\(),v8, v0
+
+        vslidedown.vi    v8, v\r1\(), 1
+        vmerge.vvm        v\tmp3\(), v8, v\r0\(), v0
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+
+        vslideup.vi    v8, v\tmp2\(), 1
+        vmerge.vvm       v\tmp4\(), v\tmp0\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp0\(), 1
+        vmerge.vvm       v\tmp5\(), v8, v\tmp2\(), v0
+
+        vslideup.vi    v8, v\tmp3\(), 1
+        vmerge.vvm       v\tmp0\(), v\tmp1\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp1\(), 1
+        vmerge.vvm       v\tmp2\(), v8, v\tmp3\(), v0
+
+        vsetivli    zero, 2, e64, m1, ta, ma
+
+        vmerge.vvm       v\r3\(), v\r3\(), v\tmp4\(), v0
+        vmerge.vvm       v\r1\(), v\r1\(), v\tmp5\(), v0
+        vmerge.vvm       v\r2\(), v\r2\(), v\tmp0\(), v0
+        vmerge.vvm       v\r0\(), v\r0\(), v\tmp2\(), v0
+.endm
+
+.macro load_trans_8x4
+        li s6, 89
+        li s7, 75
+        li s8, 50
+        li s9, 18
+
+        neg    s2, s6
+        neg    s4, s8
+        neg    s5, s9
+.endm
+
+.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
+        vsetivli        zero, 4, e16, mf2, ta, ma
+        vnclip.wi       \out0\(), \in0\(), \shift
+        vnclip.wi       \out1\(), \in2\(), \shift
+        vnclip.wi       \out2\(), \in4\(), \shift
+        vnclip.wi       \out3\(), \in6\(), \shift
+
+        vnclip.wi       \in1\(), \in1\(), \shift
+        vnclip.wi       \in3\(), \in3\(), \shift
+        vnclip.wi       \in5\(), \in5\(), \shift
+        vnclip.wi       \in7\(), \in7\(), \shift
+
+        vsetivli        zero, 2, e64, m1, ta, ma
+        vslideup.vi     \out0\(), \in1\(), 1
+        vslideup.vi     \out1\(), \in3\(), 1
+        vslideup.vi     \out2\(), \in5\(), 1
+        vslideup.vi     \out3\(), \in7\(), 1
+.endm
+
+.macro load_trans_4x4
+        li s2, 64
+        li s3, 83
+
+        li s5, 36
+
+        neg     s6, s2
+        neg     s7, s3
+.endm
+
+.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vwcvt.x.x.v     v8, \in0
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vsll.vi         v28, v8, 6
+
+        vsetivli    zero, 16, e8, m1, ta, ma
+        vmv.v.v v29, v28
+
+        load_trans_4x4
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        vwmul.vx    v30, \in1, s3
+        vwmul.vx    v31, \in1, s5
+        vwmacc.vx    v28, s2, \in2
+
+        vwmacc.vx    v29, s6, \in2
+        vwmacc.vx    v30, s5, \in3
+        vwmacc.vx    v31, s7, \in3
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vadd.vv             \out0, v28, v30
+        vadd.vv             \out1, v29, v31
+        vsub.vv             \out2, v29, v31
+        vsub.vv             \out3, v28, v30
+.endm
+
+.macro tr16_8x4 in0, in1, in2, in3, offset
+        tr_4x4_8        \in0, \in1, \in2, \in3, v24, v25, v26, v27
+
+        load_trans_8x4
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        vslidedown.vi    v8, \in0, 4
+        vwmul.vx    v28, v8, s6
+        vwmul.vx    v29, v8, s7
+        vwmul.vx    v30, v8, s8
+        vwmul.vx    v31, v8, s9
+
+        vslidedown.vi    v8, \in1, 4
+        vwmacc.vx    v28, s7, v8
+        vwmacc.vx    v29, s5, v8
+        vwmacc.vx    v30, s2, v8
+        vwmacc.vx    v31, s4, v8
+
+        vslidedown.vi    v8, \in2, 4
+        vwmacc.vx    v28, s8, v8
+        vwmacc.vx    v29, s2, v8
+        vwmacc.vx    v30, s9, v8
+        vwmacc.vx    v31, s7, v8
+
+        vslidedown.vi    v8, \in3, 4
+        vwmacc.vx    v28, s9, v8
+        vwmacc.vx    v29, s4, v8
+        vwmacc.vx    v30, s7, v8
+        vwmacc.vx    v31, s2, v8
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        butterfly       v24, v28, v16, v23
+        butterfly       v25, v29, v17, v22
+        butterfly       v26, v30, v18, v21
+        butterfly       v27, v31, v19, v20
+
+        li t0, \offset
+        add t0, sp, t0
+
+        vse32.v v16, (t0)
+        add t0, t0, 16
+        vse32.v v17, (t0)
+        add t0, t0, 16
+        vse32.v v18, (t0)
+        add t0, t0, 16
+        vse32.v v19, (t0)
+        add t0, t0, 16
+
+        vse32.v v20, (t0)
+        add t0, t0, 16
+        vse32.v v21, (t0)
+        add t0, t0, 16
+        vse32.v v22, (t0)
+        add t0, t0, 16
+        vse32.v v23, (t0)
+.endm
+
+.macro load_trans_16x4
+        li x12, 90
+        li x13, 87
+        li x14, 80
+        li x15, 70
+        li x16, 57
+        li x17, 43
+        li x18, 25
+        li x19, 9
+
+        neg x20, x12
+        neg x21, x13
+        neg x22, x14
+        neg x23, x15
+
+        neg x24, x16
+        neg x25, x17
+        neg x26, x18
+        neg x27, x19
+.endm
+
+.macro tr_16x4_rvv name, shift, offset, step
+func func_tr_16x4_\name\()_rvv, zve64x
+        mv             a2,  a0
+        addi           a3,  a0, \step * 64
+        li             a4,  \step * 128
+        vsetivli    zero, 2, e64, m1, ta, ma
+        load16_rvv     v16, v17, v18, v19, a2, a3, a4
+
+        tr16_8x4        v16, v17, v18, v19, \offset
+
+        addi            a2,  a0, \step * 32
+        addi            a3,  a0, \step * 3 *32
+        li              a4,  \step * 128
+        vsetivli        zero, 2, e64, m1, ta, ma
+        load16_rvv          v20, v17, v18, v19, a2, a3, a4
+
+        load_trans_16x4
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        vslidedown.vi    v0, v17, 4
+        vslidedown.vi    v1, v18, 4
+        vslidedown.vi    v2, v19, 4
+        vslidedown.vi    v3, v20, 4
+
+        vwmul.vx            v21, v20, x12
+        vwmul.vx            v22, v20, x13
+        vwmul.vx            v23, v20, x14
+        vwmul.vx            v24, v20, x15
+
+        vwmul.vx            v25, v20, x16
+        vwmul.vx            v26, v20, x17
+        vwmul.vx            v27, v20, x18
+        vwmul.vx            v28, v20, x19
+
+        add_member      v3, x13, x16, x19, x25, x22, x20, x23, x26
+        add_member      v17, x14, x19, x23, x21, x26, x16, x12, x17
+        add_member      v0, x15, x25, x21, x19, x12, x18, x22, x24
+        add_member      v18, x16, x22, x26, x12, x27, x21, x17, x15
+        add_member      v1, x17, x20, x16, x18, x21, x15, x19, x22
+        add_member      v19, x18, x23, x12, x22, x17, x19, x24, x13
+        add_member      v2, x19, x26, x17, x24, x15, x22, x13, x20
+
+        li t0, \offset
+        add t0, sp, t0
+        vle32.v        v16, (t0)
+
+        addi            t0, t0, 16
+        vle32.v        v17, (t0)
+
+        addi            t0, t0, 16
+        vle32.v        v18, (t0)
+
+        addi            t0, t0, 16
+        vle32.v        v19, (t0)
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        butterfly16       v16, v21, v17, v22, v18, v23, v19, v24
+        store_to_stack_rvv  \offset, (\offset + 240), v20, v21, v22, v23, v19, v18, v17, v16
+
+        li t0, \offset+64
+        add t0, sp, t0
+
+        vle32.v         v16, (t0)
+        addi            t0, t0, 16
+        vle32.v         v17, (t0)
+        addi            t0, t0, 16
+        vle32.v         v18, (t0)
+        addi            t0, t0, 16
+        vle32.v         v19, (t0)
+
+        butterfly16     v16, v25, v17, v26, v18, v27, v19, v28
+        store_to_stack_rvv  (\offset + 64), (\offset + 176), v20, v25, v26, v27, v19, v18, v17, v16
+        ret
+endfunc
+.endm
+
+tr_16x4_rvv noscale, 0, 2048, 4
+
+.macro load_trans_32x4
+        li x12, 90
+        li x13, 90
+        li x14, 88
+        li x15, 85
+        li x16, 82
+        li x17, 78
+        li x18, 73
+        li x19, 67
+        li x20, 61
+        li x21, 54
+        li x22, 46
+        li x23, 38
+        li x24, 31
+        li x25, 22
+        li x26, 13
+        li x27, 4
+.endm
+
+.macro tr_32x4_rvv name, shift
+func func_tr_32x4_\name\()_rvv, zve64x
+        mv             t3, ra
+
+        jal              func_tr_16x4_noscale_rvv
+
+        vsetivli    zero, 2, e64, m1, ta, ma
+        load32_rvv a0
+
+        li t0, 2048
+        add t2, sp, t0
+
+        load_trans_32x4
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        tr_block1
+        mv                      t5, a1
+        addi                    t1, a1, (56 + 3 * 64)
+        scale_store_rvv         \shift, t5, t1, 64
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        tr_block2
+        addi                    t5, a1, 8
+        addi                    t1, a1, (48 + 3 * 64)
+        scale_store_rvv         \shift, t5, t1, 64
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        tr_block3
+        addi                    t5, a1, 16
+        addi                    t1, a1, (40 + 3 * 64)
+        scale_store_rvv         \shift, t5, t1, 64
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        tr_block4
+        addi                    t5, a1, 24
+        addi                    t1, a1, (32 + 3 * 64)
+        scale_store_rvv         \shift, t5, t1, 64
+
+        jr t3
+endfunc
+.endm
+
+tr_32x4_rvv firstpass, 7
+tr_32x4_rvv secondpass_8, 20 - 8
+
+.macro idct_32x32 bitdepth
+func ff_hevc_idct_32x32_\bitdepth\()_rvv, zve64x
+        addi    sp, sp, -8*13
+        sd      ra, 8*12(sp)
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+        sd    s\i, 8*(11-\i)(sp)
+.endr
+        mv t6, a0
+
+        csrwi   vxrm, 1
+        li      t0, 2432
+        sub     sp, sp, t0
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+        li               t0, 8 * \i
+        add              a0, t6, t0
+
+        li               t0, 8 * \i * 32
+        add              a1, sp, t0
+
+        jal              func_tr_32x4_firstpass_rvv
+.endr
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+        addi             a0, sp, 8 * \i
+        addi             a1, t6, 8 * \i * 32
+        jal              func_tr_32x4_secondpass_\bitdepth\()_rvv
+.endr
+
+        li              t0, 2432
+        add             sp,  sp,  t0
+
+        ld    ra, 8*12(sp)
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+        ld    s\i, 8*(11-\i)(sp)
+.endr
+        addi    sp, sp, 8*13
+        ret
+endfunc
+.endm
+
+idct_32x32 8
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 1d8326a573..6dfb889eec 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -27,6 +27,8 @@
 #include "libavcodec/hevc/dsp.h"
 #include "libavcodec/riscv/h26x/h2656dsp.h"
 
+void ff_hevc_idct_32x32_8_rvv(int16_t *coeffs, int col_limit);
+
 #define RVV_FNASSIGN(member, v, h, fn, ext) \
         member[1][v][h] = ff_h2656_put_pixels_##8_##ext;  \
         member[3][v][h] = ff_h2656_put_pixels_##8_##ext;  \
@@ -40,27 +42,37 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
     const int flags = av_get_cpu_flags();
     int vlenb;
 
-    if (!(flags & AV_CPU_FLAG_RVV_I32) || !(flags & AV_CPU_FLAG_RVB))
-        return;
-
     vlenb = ff_get_rv_vlenb();
-    if (vlenb >= 32) {
-        switch (bit_depth) {
-            case 8:
-                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
-                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
-                break;
-            default:
-                break;
-        }
-    } else if (vlenb >= 16) {
-        switch (bit_depth) {
-            case 8:
-                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
-                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
-                break;
-            default:
-                break;
+
+    if (flags & AV_CPU_FLAG_RVV_I64)
+        if (vlenb >= 16)
+            switch (bit_depth) {
+                case 8:
+                    c->idct[3]                     = ff_hevc_idct_32x32_8_rvv;
+                    break;
+                default:
+                    break;
+            }
+
+    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)){
+        if (vlenb >= 32) {
+            switch (bit_depth) {
+                case 8:
+                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
+                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
+                    break;
+                default:
+                    break;
+            }
+        } else if (vlenb >= 16) {
+            switch (bit_depth) {
+                case 8:
+                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
+                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
+                    break;
+                default:
+                    break;
+            }
         }
     }
 #endif
-- 
2.25.1


* Re: [FFmpeg-devel] [PATCH v5] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC
  2025-05-30  7:40 ` [FFmpeg-devel] [PATCH v5] " daichengrong
@ 2025-06-02 20:13   ` Rémi Denis-Courmont
  2025-06-06  9:26   ` [FFmpeg-devel] [PATCH v6] libavcodec/riscv: add " daichengrong
  1 sibling, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2025-06-02 20:13 UTC (permalink / raw)
  To: ffmpeg-devel

On Friday, 30 May 2025 at 10:40:45 Eastern European Summer Time,
daichengrong@iscas.ac.cn wrote:
> From: daichengrong <daichengrong@iscas.ac.cn>
> 
> On Banana PI F3:
>         hevc_idct_32x32_8_c:                                118807.4 ( 1.00x)
>         hevc_idct_32x32_8_rvv_i64:                           13853.3 ( 8.58x)
>         hevc_idct_32x32_8_rvv_i64:                           20247.3 ( 5.92x) (before)
> 
> Changes in v5:
>         Improve the continuity of vector operations
>         Optimize loading matrices from memory to using immediate instructions
> 
> Changes in v4:
>         Optimize unnecessary slide operations
>         Extract more scalars from vector registers into general-purpose registers

It might actually be faster to keep constant coefficients in vectors and avoid 
sliding half-sized vectors. On the other hand, this would increase vector 
register pressure, so it's not clear to me.
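
As a rough sketch of what I mean (untested; v9, t0 and the lane index are
only placeholders), the coefficient row would be loaded once and individual
lanes broadcast with vrgather.vi, replacing the slide + vwmacc.vx pairs with
vector-vector MACs:

        # hypothetical: the 8 coefficients of one row kept resident in v9
        vsetivli     zero, 8, e16, m1, ta, ma
        vle16.v      v9, (t0)
        # broadcast lane 3 and accumulate, instead of sliding/extracting
        vsetivli     zero, 4, e16, mf2, ta, ma
        vrgather.vi  v8, v9, 3
        vwmacc.vv    v24, v8, v4

Whether that beats the current code probably depends on how expensive
vrgather is on the target core.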

Also see inline...

> Changes in v3:
>         remove the slides in transposition and spill values from vector
> registers to stack
> 
> Changes in v2:
>         deleted tabs
>         remove the unnecessary t0 in vsetivli
>         extract scalars directly into general registers
> 
> ---
>  libavcodec/riscv/Makefile           |   1 +
>  libavcodec/riscv/hevcdsp_idct_rvv.S | 719 ++++++++++++++++++++++++++++
>  libavcodec/riscv/hevcdsp_init.c     |  52 +-
>  3 files changed, 752 insertions(+), 20 deletions(-)
>  create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S
> 
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index a80d2fa2e7..dfc33afbee 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
>  OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
>  RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
>  OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
> +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
>  RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
>  OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
>  RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
> diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S
> new file mode 100644
> index 0000000000..4628415631
> --- /dev/null
> +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
> @@ -0,0 +1,719 @@
> +/*
> + * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3
> +        .ifc \op0, -
> +                neg    t0, \t0
> +        .endif
> +        .ifc \op1, -
> +                neg    t1, \t1
> +        .endif
> +        .ifc \op2, -
> +                neg    t4, \t2
> +        .endif
> +        .ifc \op3, -
> +                neg    t5, \t3
> +        .endif
> +
> +        .ifc \op0, -
> +                vwmacc.vx    v24, t0, \in
> +        .else
> +                vwmacc.vx    v24, \t0, \in
> +        .endif
> +        .ifc \op1, -
> +                vwmacc.vx    v25, t1, \in
> +        .else
> +                vwmacc.vx    v25, \t1, \in
> +        .endif
> +        .ifc \op2, -
> +                vwmacc.vx    v26, t4, \in
> +        .else
> +                vwmacc.vx    v26, \t2, \in
> +        .endif
> +        .ifc \op3, -
> +                vwmacc.vx    v27, t5, \in
> +        .else
> +                vwmacc.vx    v27, \t3, \in
> +        .endif
> +.endm
> +
> +.macro tr_block_init
> +        vslidedown.vi    v12, v4, 4
> +        vslidedown.vi    v13, v5, 4
> +        vslidedown.vi    v14, v6, 4
> +        vslidedown.vi    v15, v7, 4
> +
> +        vslidedown.vi    v20, v16, 4
> +        vslidedown.vi    v21, v17, 4
> +        vslidedown.vi    v22, v18, 4
> +        vslidedown.vi    v23, v19, 4
> +.endm
> +
> +.macro tr_block1
> +        tr_block_init
> +
> +        vwmul.vx        v24, v4, x12
> +        vwmul.vx        v25, v4, x13
> +        vwmul.vx        v26, v4, x14
> +        vwmul.vx        v27, v4, x15
> +
> +        add_member32    v12, x13, x16, x19, x22,  +,  +,  +,  +
> +        add_member32    v5, x14, x19, x24, x26,  +,  +,  +,  -
> +        add_member32    v13, x15, x22, x26, x19,  +,  +,  -,  -
> +        add_member32    v6, x16, x25, x21, x12,  +,  +,  -,  -
> +        add_member32    v14, x17, x27, x16, x18,  +,  -,  -,  -
> +        add_member32    v7, x18, x24, x12, x25,  +,  -,  -,  -
> +        add_member32    v15, x19, x21, x17, x23,  +,  -,  -,  +
> +
> +
> +        add_member32    v16, x20, x18, x22, x16,  +,  -,  -,  +
> +        add_member32    v20, x21, x15, x27, x14,  +,  -,  -,  +
> +        add_member32    v17, x22, x13, x23, x21,  +,  -,  +,  +
> +        add_member32    v21, x23, x14, x18, x27,  +,  -,  +,  -
> +        add_member32    v18, x24, x17, x13, x20,  +,  -,  +,  -
> +        add_member32    v22, x25, x20, x15, x13,  +,  -,  +,  -
> +        add_member32    v19, x26, x23, x20, x17,  +,  -,  +,  -
> +        add_member32    v23, x27, x26, x25, x24,  +,  -,  +,  -
> +.endm
> +
> +.macro tr_block2
> +        tr_block_init
> +
> +        vwmul.vx        v24, v4, x16
> +        vwmul.vx        v25, v4, x17
> +        vwmul.vx        v26, v4, x18
> +        vwmul.vx        v27, v4, x19
> +
> +        add_member32    v12, x25, x27, x24, x21,  +,  -,  -,  -
> +        add_member32    v5, x21, x16, x12, x17,  -,  -,  -,  -
> +        add_member32    v13, x12, x18, x25, x23,  -,  -,  -,  +
> +        add_member32    v6, x20, x26, x17, x15,  -,  +,  +,  +
> +        add_member32    v14, x26, x15, x19, x25,  +,  +,  +,  -
> +        add_member32    v7, x17, x19, x23, x12,  +,  +,  -,  -
> +        add_member32    v15, x15, x25, x13, x27,  +,  -,  -,  +
> +
> +        add_member32    v16, x24, x14, x26, x13,  +,  -,  -,  +
> +        add_member32    v20, x22, x20, x16, x26,  -,  -,  +,  +
> +        add_member32    v17, x13, x24, x20, x14,  -,  +,  +,  -
> +        add_member32    v21, x19, x13, x22, x24,  -,  +,  -,  -
> +        add_member32    v18, x27, x21, x14, x16,  +,  +,  -,  +
> +        add_member32    v22, x18, x23, x27, x22,  +,  -,  -,  +
> +        add_member32    v19, x14, x13, x15, x18,  +,  -,  +,  -
> +        add_member32    v23, x23, x22, x21, x20,  +,  -,  +,  -
> +.endm
> +
> +.macro tr_block3
> +        tr_block_init
> +
> +        vwmul.vx        v24, v4, x20
> +        vwmul.vx        v25, v4, x21
> +        vwmul.vx        v26, v4, x22
> +        vwmul.vx        v27, v4, x23
> +
> +        add_member32    v12, x18, x15, x12, x14,  -,  -,  -,  -
> +        add_member32    v5, x22, x27, x23, x18,  -,  -,  +,  +
> +        add_member32    v13, x16, x14, x21, x27,  +,  +,  +,  -
> +        add_member32    v6, x24, x22, x13, x19,  +,  -,  -,  -
> +        add_member32    v14, x14, x20, x24, x12,  -,  -,  +,  +
> +        add_member32    v7, x26, x16, x20, x22,  -,  +,  +,  -
> +        add_member32    v15, x12, x26, x14, x24,  +,  +,  -,  -
> +        add_member32    v16, x27, x13, x25, x15,  -,  -,  +,  +
> +        add_member32    v20, x13, x23, x19, x17,  -,  +,  +,  -
> +        add_member32    v17, x25, x19, x15, x26,  +,  +,  -,  +
> +        add_member32    v21, x15, x17, x26, x20,  +,  -,  +,  +
> +        add_member32    v18, x23, x25, x18, x13,  -,  -,  +,  -
> +        add_member32    v22, x17, x12, x16, x21,  -,  +,  -,  +
> +        add_member32    v19, x21, x24, x27, x25,  +,  -,  +,  +
> +        add_member32    v23, x19, x18, x17, x16,  +,  -,  +,  -
> +.endm
> +
> +.macro tr_block4
> +        tr_block_init
> +
> +        vwmul.vx        v24, v4, x24
> +        vwmul.vx        v25, v4, x25
> +        vwmul.vx        v26, v4, x26
> +        vwmul.vx        v27, v4, x27
> +
> +        add_member32    v12, x17, x20, x23, x26,  -,  -,  -,  -
> +        add_member32    v5, x12, x15, x20, x25,  +,  +,  +,  +
> +        add_member32    v13, x20, x12, x17, x24,  -,  -,  -,  -
> +        add_member32    v6, x27, x18, x14, x23,  +,  +,  +,  +
> +        add_member32    v14, x21, x23, x12, x22,  +,  -,  -,  -
> +        add_member32    v7, x14, x27, x15, x21,  -,  -,  +,  +
> +        add_member32    v15, x16, x22, x18, x20,  +,  +,  -,  -
> +        add_member32    v16, x23, x17, x21, x19,  -,  -,  +,  +
> +        add_member32    v20, x25, x13, x24, x18,  -,  +,  -,  -
> +        add_member32    v17, x18, x16, x27, x17,  +,  -,  +,  +
> +        add_member32    v21, x13, x21, x25, x16,  -,  +,  +,  -
> +        add_member32    v18, x19, x26, x22, x15,  +,  -,  -,  +
> +        add_member32    v22, x26, x24, x19, x14,  -,  -,  +,  -
> +        add_member32    v19, x22, x19, x16, x13,  -,  +,  -,  +
> +        add_member32    v23, x15, x14, x13, x12,  +,  -,  +,  -
> +.endm
> +
> +.macro butterfly e, o, tmp_p, tmp_m
> +        vadd.vv             \tmp_p, \e, \o
> +        vsub.vv             \tmp_m, \e, \o
> +.endm
> +
> +.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
> +        vadd.vv              v20, \in0, \in1
> +        vsub.vv              \in0, \in0, \in1
> +        vadd.vv              \in1, \in2, \in3
> +        vsub.vv              \in2, \in2, \in3
> +        vadd.vv              \in3, \in4, \in5
> +        vsub.vv              \in4, \in4, \in5
> +        vadd.vv              \in5, \in6, \in7
> +        vsub.vv              \in6, \in6, \in7
> +.endm
> +
> +.macro butterfly32 in0, in1, in2, in3, out
> +        vadd.vv             \out, \in0, \in1
> +        vsub.vv             \in0, \in0, \in1
> +        vadd.vv             \in1, \in2, \in3
> +        vsub.vv             \in2, \in2, \in3
> +.endm
> +
> +.macro add_member in, tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7
> +        vwmacc.vx    v21, \tt0, \in
> +        vwmacc.vx    v22, \tt1, \in
> +        vwmacc.vx    v23, \tt2, \in
> +        vwmacc.vx    v24, \tt3, \in
> +        vwmacc.vx    v25, \tt4, \in
> +        vwmacc.vx    v26, \tt5, \in
> +        vwmacc.vx    v27, \tt6, \in
> +        vwmacc.vx    v28, \tt7, \in
> +.endm
> +
> +.macro load16_rvv in0, in1, in2, in3, lo, hi, step
> +        sub t0, \hi, \lo
> +
> +        vlse64.v \in0, (\lo), t0
> +        add \lo, \lo, \step

I'm paraphrasing Martin here, but this sort of sequence is not nice on
in-order cores. Ditto several times throughout the patchset.
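
The usual workaround is to compute the addresses first and then issue the
loads back to back, so that no vlse64.v has to wait on the address update
right before it. Rough sketch only (temporaries picked arbitrarily):

        sub      t0, \hi, \lo
        add      t1, \lo, \step
        add      t2, t1, \step
        add      t4, t2, \step
        vlse64.v \in0, (\lo), t0
        vlse64.v \in1, (t1), t0
        vlse64.v \in2, (t2), t0
        vlse64.v \in3, (t4), t0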

> +
> +        vlse64.v \in1, (\lo), t0
> +        add \lo, \lo, \step
> +
> +        vlse64.v \in2, (\lo), t0
> +        add \lo, \lo, \step
> +
> +        vlse64.v \in3, (\lo), t0
> +.endm
> +
> +.macro store16_rvv in0, in1, in2, in3, lo, hi, step
> +        sub t0, \hi, \lo
> +        li  t1, 2 * \step
> +        vsse64.v \in0, (\lo), t0
> +
> +        addi \lo, \lo, \step
> +        sub t0, t0, t1
> +        vsse64.v \in1, (\lo), t0
> +
> +        addi \lo, \lo, \step
> +        sub t0, t0, t1
> +        vsse64.v \in2, (\lo), t0
> +
> +        addi \lo, \lo, \step
> +        sub t0, t0, t1
> +        vsse64.v \in3, (\lo), t0
> +.endm
> +
> +.macro load32_rvv src
> +        addi             a2,  \src, 64
> +        li               a3,  256
> +        li               t0, 128
> +
> +        vlse64.v v4, (a2), t0
> +
> +        add s2, a2, a3

Can't you use addi here?
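
That is, for example:

        addi s2, a2, 256

and likewise for the following addresses — every offset here fits in the
12-bit immediate.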

> +        vlse64.v v5, (s2), t0
> +
> +        add s3, s2, a3
> +        vlse64.v v6, (s3), t0
> +
> +        add s4, s3, a3
> +        vlse64.v v7, (s4), t0
> +
> +        add s5, s4, a3
> +        vlse64.v v16, (s5), t0
> +
> +        add s6, s5, a3
> +        vlse64.v v17, (s6), t0
> +
> +        add s7, s6, a3
> +        vlse64.v v18, (s7), t0
> +
> +        add s8, s7, a3
> +        vlse64.v v19, (s8), t0
> +.endm
> +
> +.macro scale_store_rvv shift, dstL, dstH, step
> +        vsetivli    zero, 8, e16, m1, ta, ma
> +        vle16.v        v28, (t2)
> +        addi            t2, t2, 2*8
> +        vle16.v        v29, (t2)
> +        addi            t2, t2, 2*8
> +        vle16.v        v30, (t2)
> +        addi            t2, t2, 2*8
> +        vle16.v        v31, (t2)
> +        addi            t2, t2, 2*8
> +
> +        vsetivli    zero, 4, e32, m1, ta, ma
> +        butterfly32     v28, v24, v29, v25, v2
> +        butterfly32     v30, v26, v31, v27, v3
> +
> +        scale           v20, v21, v22, v23, v2, v28, v24, v29, v3, v30, v26, v31, \shift
> +
> +        transpose16_4x4_2 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
> +
> +        vsetivli    zero, 2, e64, m1, ta, ma
> +        store16_rvv         v20, v21, v22, v23, \dstL, \dstH, \step
> +.endm
> +
> +.macro store_to_stack_rvv off1, off2, in0, in2, in4, in6, in7, in5, in3, in1
> +        li      t0, \off1
> +        add     a2, sp, t0

Again, addi

> +
> +        li      t0, \off2
> +        add     a3, sp, t0
> +
> +        vse32.v         \in0, (a2)
> +        vse32.v         \in1, (a3)
> +        addi             a2, a2, 16
> +        addi             a3, a3, -16
> +        vse32.v         \in2, (a2)
> +        vse32.v         \in3, (a3)
> +        addi             a2, a2, 16
> +        addi             a3, a3, -16
> +        vse32.v         \in4, (a2)
> +        vse32.v         \in5, (a3)
> +        addi             a2, a2, 16
> +        addi             a3, a3, -16
> +        vse32.v         \in6, (a2)
> +        vse32.v         \in7, (a3)
> +.endm
> +
> +.macro transpose16_4x4_2 r0, r1, r2, r3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
> +        vsetivli    zero, 8, e16, m1, ta, ma
> +        vid.v     v0
> +        vand.vi   v8, v0, 1
> +        vmsne.vi  v0, v8, 0
> +
> +        vslideup.vi    v8, v\r1, 1
> +        vsetivli    zero, 4, e16, m1, ta, ma
> +        vmerge.vvm       v\tmp0\(), v\r0\(), v8, v0
> +
> +        vslidedown.vi    v8, v\r0, 1
> +        vmerge.vvm       v\tmp1\(), v8, v\r1\(), v0
> +
> +        vslideup.vi    v8, v\r3, 1
> +        vmerge.vvm       v\tmp2\(), v\r2\(), v8, v0
> +
> +        vslidedown.vi    v8, v\r2\(), 1
> +        vmerge.vvm       v\tmp3\(), v8, v\r3\(), v0
> +
> +        vsetivli    zero, 2, e32, m1, ta, ma
> +
> +        vslideup.vi    v8, v\tmp2\(), 1
> +        vmerge.vvm       v\tmp4\(), v\tmp0\(), v8, v0
> +
> +        vslidedown.vi    v8, v\tmp0\(), 1
> +        vmerge.vvm       v\tmp5\(), v8, v\tmp2\(), v0
> +
> +        vslideup.vi    v8, v\tmp3\(), 1
> +        vmerge.vvm       v\tmp0\(), v\tmp1\(), v8, v0
> +
> +        vslidedown.vi    v8, v\tmp1\(), 1
> +        vmerge.vvm       v\tmp2\(), v8, v\tmp3\(), v0
> +
> +        vsetivli    zero, 1, e64, m1, ta, ma
> +        vmv.v.v  v\r0\(), v\tmp4\()
> +        vmv.v.v  v\r2\(), v\tmp5\()
> +        vmv.v.v  v\r1\(), v\tmp0\()
> +        vmv.v.v  v\r3\(), v\tmp2\()
> +
> +        vsetivli    zero, 8, e16, m1, ta, ma
> +
> +        vslideup.vi    v8, v\r2\(), 1
> +        vmerge.vvm       v\tmp0\(), v\r3\(), v8, v0
> +
> +        vslidedown.vi    v8, v\r3\(), 1
> +        vmerge.vvm        v\tmp1\(), v8, v\r2\(), v0
> +
> +        vslideup.vi    v8, v\r0\(), 1
> +        vmerge.vvm       v\tmp2\(), v\r1\(),v8, v0
> +
> +        vslidedown.vi    v8, v\r1\(), 1
> +        vmerge.vvm        v\tmp3\(), v8, v\r0\(), v0
> +
> +        vsetivli    zero, 4, e32, m1, ta, ma
> +
> +        vslideup.vi    v8, v\tmp2\(), 1
> +        vmerge.vvm       v\tmp4\(), v\tmp0\(), v8, v0
> +
> +        vslidedown.vi    v8, v\tmp0\(), 1
> +        vmerge.vvm       v\tmp5\(), v8, v\tmp2\(), v0
> +
> +        vslideup.vi    v8, v\tmp3\(), 1
> +        vmerge.vvm       v\tmp0\(), v\tmp1\(), v8, v0
> +
> +        vslidedown.vi    v8, v\tmp1\(), 1
> +        vmerge.vvm       v\tmp2\(), v8, v\tmp3\(), v0
> +
> +        vsetivli    zero, 2, e64, m1, ta, ma
> +
> +        vmerge.vvm       v\r3\(), v\r3\(), v\tmp4\(), v0
> +        vmerge.vvm       v\r1\(), v\r1\(), v\tmp5\(), v0
> +        vmerge.vvm       v\r2\(), v\r2\(), v\tmp0\(), v0
> +        vmerge.vvm       v\r0\(), v\r0\(), v\tmp2\(), v0
> +.endm

Again, I doubt that manual in-register transposition is any faster than 
segmented loads/stores in this case. Loads and stores are slow, but so are 
slides.
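
For the 4x4 sub-blocks, a strided segment load already delivers the data
transposed, something like this (untested sketch; t0, t1 and the row stride
are placeholders):

        # four 16-bit rows, 64 bytes apart; field register j gets column j
        li           t1, 64
        vsetivli     zero, 4, e16, mf2, ta, ma
        vlsseg4e16.v v20, (t0), t1

so the whole merge/slide dance would go away, at the cost of the segment
load itself.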

> +
> +.macro load_trans_8x4
> +        li s6, 89
> +        li s7, 75
> +        li s8, 50
> +        li s9, 18
> +
> +        neg    s2, s6
> +        neg    s4, s8
> +        neg    s5, s9
> +.endm
> +
> +.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
> +        vsetivli        zero, 4, e16, mf2, ta, ma
> +        vnclip.wi       \out0\(), \in0\(), \shift
> +        vnclip.wi       \out1\(), \in2\(), \shift
> +        vnclip.wi       \out2\(), \in4\(), \shift
> +        vnclip.wi       \out3\(), \in6\(), \shift
> +
> +        vnclip.wi       \in1\(), \in1\(), \shift
> +        vnclip.wi       \in3\(), \in3\(), \shift
> +        vnclip.wi       \in5\(), \in5\(), \shift
> +        vnclip.wi       \in7\(), \in7\(), \shift
> +
> +        vsetivli        zero, 2, e64, m1, ta, ma
> +        vslideup.vi     \out0\(), \in1\(), 1
> +        vslideup.vi     \out1\(), \in3\(), 1
> +        vslideup.vi     \out2\(), \in5\(), 1
> +        vslideup.vi     \out3\(), \in7\(), 1
> +.endm
> +
> +.macro load_trans_4x4
> +        li s2, 64
> +        li s3, 83
> +
> +        li s5, 36
> +
> +        neg     s6, s2
> +        neg     s7, s3

You don't need a register dependency to calculate a constant.
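
That is, just:

        li s6, -64
        li s7, -83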

> +.endm
> +
> +.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3
> +        vsetivli    zero, 4, e16, m1, ta, ma
> +        vwcvt.x.x.v     v8, \in0
> +        vsetivli    zero, 4, e32, m1, ta, ma
> +        vsll.vi         v28, v8, 6
> +
> +        vsetivli    zero, 16, e8, m1, ta, ma
> +        vmv.v.v v29, v28
> +
> +        load_trans_4x4
> +
> +        vsetivli    zero, 4, e16, mf2, ta, ma
> +        vwmul.vx    v30, \in1, s3
> +        vwmul.vx    v31, \in1, s5
> +        vwmacc.vx    v28, s2, \in2
> +
> +        vwmacc.vx    v29, s6, \in2
> +        vwmacc.vx    v30, s5, \in3
> +        vwmacc.vx    v31, s7, \in3
> +
> +        vsetivli    zero, 4, e32, m1, ta, ma
> +        vadd.vv             \out0, v28, v30
> +        vadd.vv             \out1, v29, v31
> +        vsub.vv             \out2, v29, v31
> +        vsub.vv             \out3, v28, v30
> +.endm
> +
> +.macro tr16_8x4 in0, in1, in2, in3, offset
> +        tr_4x4_8        \in0, \in1, \in2, \in3, v24, v25, v26, v27
> +
> +        load_trans_8x4
> +
> +        vsetivli    zero, 4, e16, mf2, ta, ma
> +        vslidedown.vi    v8, \in0, 4
> +        vwmul.vx    v28, v8, s6
> +        vwmul.vx    v29, v8, s7
> +        vwmul.vx    v30, v8, s8
> +        vwmul.vx    v31, v8, s9
> +
> +        vslidedown.vi    v8, \in1, 4
> +        vwmacc.vx    v28, s7, v8
> +        vwmacc.vx    v29, s5, v8
> +        vwmacc.vx    v30, s2, v8
> +        vwmacc.vx    v31, s4, v8
> +
> +        vslidedown.vi    v8, \in2, 4
> +        vwmacc.vx    v28, s8, v8
> +        vwmacc.vx    v29, s2, v8
> +        vwmacc.vx    v30, s9, v8
> +        vwmacc.vx    v31, s7, v8
> +
> +        vslidedown.vi    v8, \in3, 4
> +        vwmacc.vx    v28, s9, v8
> +        vwmacc.vx    v29, s4, v8
> +        vwmacc.vx    v30, s7, v8
> +        vwmacc.vx    v31, s2, v8
> +
> +        vsetivli    zero, 4, e32, m1, ta, ma
> +        butterfly       v24, v28, v16, v23
> +        butterfly       v25, v29, v17, v22
> +        butterfly       v26, v30, v18, v21
> +        butterfly       v27, v31, v19, v20
> +
> +        li t0, \offset
> +        add t0, sp, t0
> +
> +        vse32.v v16, (t0)
> +        add t0, t0, 16

Not valid syntax.
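
addi is what is wanted here (and in the lines below):

        addi t0, t0, 16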

> +        vse32.v v17, (t0)
> +        add t0, t0, 16
> +        vse32.v v18, (t0)
> +        add t0, t0, 16
> +        vse32.v v19, (t0)
> +        add t0, t0, 16
> +
> +        vse32.v v20, (t0)
> +        add t0, t0, 16
> +        vse32.v v21, (t0)
> +        add t0, t0, 16
> +        vse32.v v22, (t0)
> +        add t0, t0, 16
> +        vse32.v v23, (t0)
> +.endm
> +
> +.macro load_trans_16x4
> +        li x12, 90
> +        li x13, 87
> +        li x14, 80
> +        li x15, 70
> +        li x16, 57
> +        li x17, 43
> +        li x18, 25
> +        li x19, 9
> +
> +        neg x20, x12
> +        neg x21, x13
> +        neg x22, x14
> +        neg x23, x15
> +
> +        neg x24, x16
> +        neg x25, x17
> +        neg x26, x18
> +        neg x27, x19
> +.endm
> +
> +.macro tr_16x4_rvv name, shift, offset, step
> +func func_tr_16x4_\name\()_rvv, zve64x
> +        mv             a2,  a0
> +        addi           a3,  a0, \step * 64
> +        li             a4,  \step * 128
> +        vsetivli    zero, 2, e64, m1, ta, ma
> +        load16_rvv     v16, v17, v18, v19, a2, a3, a4
> +
> +        tr16_8x4        v16, v17, v18, v19, \offset
> +
> +        addi            a2,  a0, \step * 32
> +        addi            a3,  a0, \step * 3 *32
> +        li              a4,  \step * 128
> +        vsetivli        zero, 2, e64, m1, ta, ma
> +        load16_rvv          v20, v17, v18, v19, a2, a3, a4
> +
> +        load_trans_16x4
> +
> +        vsetivli    zero, 4, e16, mf2, ta, ma
> +        vslidedown.vi    v0, v17, 4
> +        vslidedown.vi    v1, v18, 4
> +        vslidedown.vi    v2, v19, 4
> +        vslidedown.vi    v3, v20, 4
> +
> +        vwmul.vx            v21, v20, x12
> +        vwmul.vx            v22, v20, x13
> +        vwmul.vx            v23, v20, x14
> +        vwmul.vx            v24, v20, x15
> +
> +        vwmul.vx            v25, v20, x16
> +        vwmul.vx            v26, v20, x17
> +        vwmul.vx            v27, v20, x18
> +        vwmul.vx            v28, v20, x19
> +
> +        add_member      v3, x13, x16, x19, x25, x22, x20, x23, x26
> +        add_member      v17, x14, x19, x23, x21, x26, x16, x12, x17
> +        add_member      v0, x15, x25, x21, x19, x12, x18, x22, x24
> +        add_member      v18, x16, x22, x26, x12, x27, x21, x17, x15
> +        add_member      v1, x17, x20, x16, x18, x21, x15, x19, x22
> +        add_member      v19, x18, x23, x12, x22, x17, x19, x24, x13
> +        add_member      v2, x19, x26, x17, x24, x15, x22, x13, x20
> +
> +        li t0, \offset
> +        add t0, sp, t0
> +        vle32.v        v16, (t0)
> +
> +        addi            t0, t0, 16
> +        vle32.v        v17, (t0)
> +
> +        addi            t0, t0, 16
> +        vle32.v        v18, (t0)
> +
> +        addi            t0, t0, 16
> +        vle32.v        v19, (t0)
> +
> +        vsetivli    zero, 4, e32, m1, ta, ma
> +        butterfly16       v16, v21, v17, v22, v18, v23, v19, v24
> +        store_to_stack_rvv  \offset, (\offset + 240), v20, v21, v22, v23, v19, v18, v17, v16
> +
> +        li t0, \offset+64
> +        add t0, sp, t0
> +
> +        vle32.v         v16, (t0)
> +        addi            t0, t0, 16
> +        vle32.v         v17, (t0)
> +        addi            t0, t0, 16
> +        vle32.v         v18, (t0)
> +        addi            t0, t0, 16
> +        vle32.v         v19, (t0)
> +
> +        butterfly16     v16, v25, v17, v26, v18, v27, v19, v28
> +        store_to_stack_rvv  (\offset + 64), (\offset + 176), v20, v25, v26, v27, v19, v18, v17, v16
> +        ret
> +endfunc
> +.endm
> +
> +tr_16x4_rvv noscale, 0, 2048, 4
> +
> +.macro load_trans_32x4
> +        li x12, 90
> +        li x13, 90
> +        li x14, 88
> +        li x15, 85
> +        li x16, 82
> +        li x17, 78
> +        li x18, 73
> +        li x19, 67
> +        li x20, 61
> +        li x21, 54
> +        li x22, 46
> +        li x23, 38
> +        li x24, 31
> +        li x25, 22
> +        li x26, 13
> +        li x27, 4
> +.endm
> +
> +.macro tr_32x4_rvv name, shift
> +func func_tr_32x4_\name\()_rvv, zve64x
> +        mv             t3, ra
> +
> +        jal              func_tr_16x4_noscale_rvv
> +
> +        vsetivli    zero, 2, e64, m1, ta, ma
> +        load32_rvv a0
> +
> +        li t0, 2048
> +        add t2, sp, t0
> +
> +        load_trans_32x4
> +
> +        vsetivli    zero, 4, e16, mf2, ta, ma
> +        tr_block1
> +        mv                      t5, a1
> +        addi                    t1, a1, (56 + 3 * 64)
> +        scale_store_rvv         \shift, t5, t1, 64
> +
> +        vsetivli    zero, 4, e16, mf2, ta, ma
> +        tr_block2
> +        addi                    t5, a1, 8
> +        addi                    t1, a1, (48 + 3 * 64)
> +        scale_store_rvv         \shift, t5, t1, 64
> +
> +        vsetivli    zero, 4, e16, mf2, ta, ma
> +        tr_block3
> +        addi                    t5, a1, 16
> +        addi                    t1, a1, (40 + 3 * 64)
> +        scale_store_rvv         \shift, t5, t1, 64
> +
> +        vsetivli    zero, 4, e16, mf2, ta, ma
> +        tr_block4
> +        addi                    t5, a1, 24
> +        addi                    t1, a1, (32 + 3 * 64)
> +        scale_store_rvv         \shift, t5, t1, 64
> +
> +        jr t3

t3 is not a link register. This breaks return address prediction.
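
Better to spill ra around the call, so the final return is a plain ret and
the return-address stack stays balanced. Rough sketch only (the riscv32
remark below applies to the sd/ld here as well):

        addi    sp, sp, -16
        sd      ra, 0(sp)
        jal     func_tr_16x4_noscale_rvv
        # ... rest of the transform ...
        ld      ra, 0(sp)
        addi    sp, sp, 16
        ret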

> +endfunc
> +.endm
> +
> +tr_32x4_rvv firstpass, 7
> +tr_32x4_rvv secondpass_8, 20 - 8
> +
> +.macro idct_32x32 bitdepth
> +func ff_hevc_idct_32x32_\bitdepth\()_rvv, zve64x
> +        addi    sp, sp, -8*13
> +        sd      ra, 8*12(sp)
> +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
> +        sd    s\i, 8*(11-\i)(sp)
> +.endr

This won't compile on riscv32.
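
sd/ld and the 8-byte slots are RV64-only. One way out, unless asm.S already
provides something nicer, is to derive the spill width from __riscv_xlen;
XSTORE/XLOAD/XLB below are made-up names, sketch only:

#if (__riscv_xlen == 64)
#define XSTORE  sd
#define XLOAD   ld
#define XLB     8
#else
#define XSTORE  sw
#define XLOAD   lw
#define XLB     4
#endif
        addi    sp, sp, -(XLB * 13)
        XSTORE  ra, XLB * 12(sp)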

> +        mv t6, a0
> +
> +        csrwi   vxrm, 1
> +        li      t0, 2432
> +        sub     sp, sp, t0
> +
> +.irp i, 0, 1, 2, 3, 4, 5, 6, 7
> +        li               t0, 8 * \i
> +        add              a0, t6, t0
> +
> +        li               t0, 8 * \i * 32
> +        add              a1, sp, t0
> +
> +        jal              func_tr_32x4_firstpass_rvv
> +.endr
> +
> +.irp i, 0, 1, 2, 3, 4, 5, 6, 7
> +        addi             a0, sp, 8 * \i
> +        addi             a1, t6, 8 * \i * 32
> +        jal              func_tr_32x4_secondpass_\bitdepth\()_rvv
> +.endr
> +
> +        li              t0, 2432
> +        add             sp,  sp,  t0
> +
> +        ld    ra, 8*12(sp)
> +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
> +        ld    s\i, 8*(11-\i)(sp)
> +.endr
> +        addi    sp, sp, 8*13
> +        ret
> +endfunc
> +.endm
> +
> +idct_32x32 8
> diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
> index 1d8326a573..6dfb889eec 100644
> --- a/libavcodec/riscv/hevcdsp_init.c
> +++ b/libavcodec/riscv/hevcdsp_init.c
> @@ -27,6 +27,8 @@
>  #include "libavcodec/hevc/dsp.h"
>  #include "libavcodec/riscv/h26x/h2656dsp.h"
> 
> +void ff_hevc_idct_32x32_8_rvv(int16_t *coeffs, int col_limit);
> +
>  #define RVV_FNASSIGN(member, v, h, fn, ext) \
>          member[1][v][h] = ff_h2656_put_pixels_##8_##ext;  \
>          member[3][v][h] = ff_h2656_put_pixels_##8_##ext;  \
> @@ -40,27 +42,37 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
>      const int flags = av_get_cpu_flags();
>      int vlenb;
> 
> -    if (!(flags & AV_CPU_FLAG_RVV_I32) || !(flags & AV_CPU_FLAG_RVB))
> -        return;
> -
>      vlenb = ff_get_rv_vlenb();
> -    if (vlenb >= 32) {
> -        switch (bit_depth) {
> -            case 8:
> -                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
> -                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
> -                break;
> -            default:
> -                break;
> -        }
> -    } else if (vlenb >= 16) {
> -        switch (bit_depth) {
> -            case 8:
> -                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
> -                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
> -                break;
> -            default:
> -                break;
> +
> +    if (flags & AV_CPU_FLAG_RVV_I64)
> +        if (vlenb >= 16)
> +            switch (bit_depth) {
> +                case 8:
> +                    c->idct[3]                     = ff_hevc_idct_32x32_8_rvv;
> +                    break;
> +                default:
> +                    break;
> +            }
> +
> +    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)){
> +        if (vlenb >= 32) {
> +            switch (bit_depth) {
> +                case 8:
> +                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
> +                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
> +                    break;
> +                default:
> +                    break;
> +            }
> +        } else if (vlenb >= 16) {
> +            switch (bit_depth) {
> +                case 8:
> +                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
> +                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
> +                    break;
> +                default:
> +                    break;
> +            }
>          }
>      }
>  #endif


-- 
德尼-库尔蒙‧雷米
Villeneuve de Tapiola, ex-République finlandaise d´Uusimaa




* [FFmpeg-devel] [PATCH v6] libavcodec/riscv: add RVV optimized idct_32x32_8 for HEVC
  2025-05-30  7:40 ` [FFmpeg-devel] [PATCH v5] " daichengrong
  2025-06-02 20:13   ` Rémi Denis-Courmont
@ 2025-06-06  9:26   ` daichengrong
  1 sibling, 0 replies; 5+ messages in thread
From: daichengrong @ 2025-06-06  9:26 UTC (permalink / raw)
  To: ffmpeg-devel

From: daichengrong <daichengrong@iscas.ac.cn>

On Banana PI F3:
        hevc_idct_32x32_8_c:                                119249.5 ( 1.00x)
        hevc_idct_32x32_8_rvv_i64:                           13352.5 ( 8.93x)
        hevc_idct_32x32_8_rvv_i64:                           13830.1 ( 8.66x) (transpose16_4x4_2 segmented L/S)

Changes in v6:
        Optimize data loading and avoid sliding half-sized vectors
        Adopt an instruction scheduling strategy that is more favorable to in-order cores
        Encode more immediate values into instructions
        Support register save and restore for different XLEN values
        Optimize for VLEN > 128

Changes in v5:
        Improve the continuity of vector operations
        Optimize loading matrices from memory to using immediate instructions

Changes in v4:
        Optimize unnecessary slide operations
        Extract more scalars from vector registers into general-purpose registers

Changes in v3:
        remove the slides in transposition and spill values from vector registers to stack

Changes in v2:
        deleted tabs
        remove the unnecessary t0 in vsetivli
        extract scalars directly into general registers
---
 libavcodec/riscv/Makefile           |   1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 748 ++++++++++++++++++++++++++++
 libavcodec/riscv/hevcdsp_init.c     |  61 ++-
 3 files changed, 791 insertions(+), 19 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 0000000000..2a0db809d9
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,748 @@
+/*
+ * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3
+        .ifc \op0, -
+                neg    t0, \t0
+        .endif
+        .ifc \op1, -
+                neg    t1, \t1
+        .endif
+        .ifc \op2, -
+                neg    t4, \t2
+        .endif
+        .ifc \op3, -
+                neg    t5, \t3
+        .endif
+
+        .ifc \op0, -
+                vwmacc.vx    v24, t0, \in
+        .else
+                vwmacc.vx    v24, \t0, \in
+        .endif
+        .ifc \op1, -
+                vwmacc.vx    v25, t1, \in
+        .else
+                vwmacc.vx    v25, \t1, \in
+        .endif
+        .ifc \op2, -
+                vwmacc.vx    v26, t4, \in
+        .else
+                vwmacc.vx    v26, \t2, \in
+        .endif
+        .ifc \op3, -
+                vwmacc.vx    v27, t5, \in
+        .else
+                vwmacc.vx    v27, \t3, \in
+        .endif
+.endm
+
+.macro tr_block1
+        vwmul.vx        v24, v4, x12
+        vwmul.vx        v25, v4, x13
+        vwmul.vx        v26, v4, x14
+        vwmul.vx        v27, v4, x15
+
+        add_member32    v12, x13, x16, x19, x22,  +,  +,  +,  +
+        add_member32    v5, x14, x19, x24, x26,  +,  +,  +,  -
+        add_member32    v13, x15, x22, x26, x19,  +,  +,  -,  -
+        add_member32    v6, x16, x25, x21, x12,  +,  +,  -,  -
+        add_member32    v14, x17, x27, x16, x18,  +,  -,  -,  -
+        add_member32    v7, x18, x24, x12, x25,  +,  -,  -,  -
+        add_member32    v15, x19, x21, x17, x23,  +,  -,  -,  +
+
+        add_member32    v16, x20, x18, x22, x16,  +,  -,  -,  +
+        add_member32    v20, x21, x15, x27, x14,  +,  -,  -,  +
+        add_member32    v17, x22, x13, x23, x21,  +,  -,  +,  +
+        add_member32    v21, x23, x14, x18, x27,  +,  -,  +,  -
+        add_member32    v18, x24, x17, x13, x20,  +,  -,  +,  -
+        add_member32    v22, x25, x20, x15, x13,  +,  -,  +,  -
+        add_member32    v19, x26, x23, x20, x17,  +,  -,  +,  -
+        add_member32    v23, x27, x26, x25, x24,  +,  -,  +,  -
+.endm
+
+.macro tr_block2
+        vwmul.vx        v24, v4, x16
+        vwmul.vx        v25, v4, x17
+        vwmul.vx        v26, v4, x18
+        vwmul.vx        v27, v4, x19
+
+        add_member32    v12, x25, x27, x24, x21,  +,  -,  -,  -
+        add_member32    v5, x21, x16, x12, x17,  -,  -,  -,  -
+        add_member32    v13, x12, x18, x25, x23,  -,  -,  -,  +
+        add_member32    v6, x20, x26, x17, x15,  -,  +,  +,  +
+        add_member32    v14, x26, x15, x19, x25,  +,  +,  +,  -
+        add_member32    v7, x17, x19, x23, x12,  +,  +,  -,  -
+        add_member32    v15, x15, x25, x13, x27,  +,  -,  -,  +
+
+        add_member32    v16, x24, x14, x26, x13,  +,  -,  -,  +
+        add_member32    v20, x22, x20, x16, x26,  -,  -,  +,  +
+        add_member32    v17, x13, x24, x20, x14,  -,  +,  +,  -
+        add_member32    v21, x19, x13, x22, x24,  -,  +,  -,  -
+        add_member32    v18, x27, x21, x14, x16,  +,  +,  -,  +
+        add_member32    v22, x18, x23, x27, x22,  +,  -,  -,  +
+        add_member32    v19, x14, x13, x15, x18,  +,  -,  +,  -
+        add_member32    v23, x23, x22, x21, x20,  +,  -,  +,  -
+.endm
+
+.macro tr_block3
+        vwmul.vx        v24, v4, x20
+        vwmul.vx        v25, v4, x21
+        vwmul.vx        v26, v4, x22
+        vwmul.vx        v27, v4, x23
+
+        add_member32    v12, x18, x15, x12, x14,  -,  -,  -,  -
+        add_member32    v5, x22, x27, x23, x18,  -,  -,  +,  +
+        add_member32    v13, x16, x14, x21, x27,  +,  +,  +,  -
+        add_member32    v6, x24, x22, x13, x19,  +,  -,  -,  -
+        add_member32    v14, x14, x20, x24, x12,  -,  -,  +,  +
+        add_member32    v7, x26, x16, x20, x22,  -,  +,  +,  -
+        add_member32    v15, x12, x26, x14, x24,  +,  +,  -,  -
+        add_member32    v16, x27, x13, x25, x15,  -,  -,  +,  +
+        add_member32    v20, x13, x23, x19, x17,  -,  +,  +,  -
+        add_member32    v17, x25, x19, x15, x26,  +,  +,  -,  +
+        add_member32    v21, x15, x17, x26, x20,  +,  -,  +,  +
+        add_member32    v18, x23, x25, x18, x13,  -,  -,  +,  -
+        add_member32    v22, x17, x12, x16, x21,  -,  +,  -,  +
+        add_member32    v19, x21, x24, x27, x25,  +,  -,  +,  +
+        add_member32    v23, x19, x18, x17, x16,  +,  -,  +,  -
+.endm
+
+.macro tr_block4
+        vwmul.vx        v24, v4, x24
+        vwmul.vx        v25, v4, x25
+        vwmul.vx        v26, v4, x26
+        vwmul.vx        v27, v4, x27
+
+        add_member32    v12, x17, x20, x23, x26,  -,  -,  -,  -
+        add_member32    v5, x12, x15, x20, x25,  +,  +,  +,  +
+        add_member32    v13, x20, x12, x17, x24,  -,  -,  -,  -
+        add_member32    v6, x27, x18, x14, x23,  +,  +,  +,  +
+        add_member32    v14, x21, x23, x12, x22,  +,  -,  -,  -
+        add_member32    v7, x14, x27, x15, x21,  -,  -,  +,  +
+        add_member32    v15, x16, x22, x18, x20,  +,  +,  -,  -
+        add_member32    v16, x23, x17, x21, x19,  -,  -,  +,  +
+        add_member32    v20, x25, x13, x24, x18,  -,  +,  -,  -
+        add_member32    v17, x18, x16, x27, x17,  +,  -,  +,  +
+        add_member32    v21, x13, x21, x25, x16,  -,  +,  +,  -
+        add_member32    v18, x19, x26, x22, x15,  +,  -,  -,  +
+        add_member32    v22, x26, x24, x19, x14,  -,  -,  +,  -
+        add_member32    v19, x22, x19, x16, x13,  -,  +,  -,  +
+        add_member32    v23, x15, x14, x13, x12,  +,  -,  +,  -
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m
+        vadd.vv             \tmp_p, \e, \o
+        vsub.vv             \tmp_m, \e, \o
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+        vadd.vv              v20, \in0, \in1
+        vsub.vv              \in0, \in0, \in1
+        vadd.vv              \in1, \in2, \in3
+        vsub.vv              \in2, \in2, \in3
+        vadd.vv              \in3, \in4, \in5
+        vsub.vv              \in4, \in4, \in5
+        vadd.vv              \in5, \in6, \in7
+        vsub.vv              \in6, \in6, \in7
+.endm
+
+.macro butterfly32 in0, in1, in2, in3, out
+        vadd.vv             \out, \in0, \in1
+        vsub.vv             \in0, \in0, \in1
+        vadd.vv             \in1, \in2, \in3
+        vsub.vv             \in2, \in2, \in3
+.endm
+
+.macro add_member in, tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7
+        vwmacc.vx    v21, \tt0, \in
+        vwmacc.vx    v22, \tt1, \in
+        vwmacc.vx    v23, \tt2, \in
+        vwmacc.vx    v24, \tt3, \in
+        vwmacc.vx    v25, \tt4, \in
+        vwmacc.vx    v26, \tt5, \in
+        vwmacc.vx    v27, \tt6, \in
+        vwmacc.vx    v28, \tt7, \in
+.endm
+
+.macro load16_rvv in0, in1, in2, in3, off1, off2, step, in4, in5, in6, in7
+        addi t0,  a0, \off1
+        addi a2, t0, \step * 1
+        addi a3, t0, \step * 2
+        addi a4, t0, \step * 3
+
+        addi t1,  a0, \off2
+        addi s2, t1, \step * 1
+        addi s3, t1, \step * 2
+        addi s4, t1, \step * 3
+
+        vle16.v \in0, (t0)
+        vle16.v \in1, (a2)
+        vle16.v \in2, (a3)
+        vle16.v \in3, (a4)
+
+        vle16.v \in4, (t1)
+        vle16.v \in5, (s2)
+        vle16.v \in6, (s3)
+        vle16.v \in7, (s4)
+.endm
+
+.macro store16_rvv in0, in1, in2, in3, off1, off2, step
+        li t0, \off2
+        addi t0, t0, -\off1
+        addi t2, t0, -2 * \step
+        addi t4, t0, -4 * \step
+        addi s0, t0, -6 * \step
+
+        addi t1, a1, \off1
+        addi t3, t1, \step
+        addi t5, t1, 2 * \step
+        addi s1, t1, 3 * \step
+
+        vsse64.v \in0, (t1), t0
+        vsse64.v \in1, (t3), t2
+        vsse64.v \in2, (t5), t4
+        vsse64.v \in3, (s1), s0
+.endm
+
+.macro load32_rvv
+        addi t0,  a0, 64
+        addi a2, t0, 256 * 1
+        addi a3, t0, 256 * 2
+        addi a4, t0, 256 * 3
+        addi a5, t0, 256 * 4
+        addi a6, t0, 256 * 5
+        addi a7, t0, 256 * 6
+        addi s9, t0, 256 * 7
+
+        addi t1,  t0, 128
+        addi s2, t1, 256 * 1
+        addi s3, t1, 256 * 2
+        addi s4, t1, 256 * 3
+        addi s5, t1, 256 * 4
+        addi s6, t1, 256 * 5
+        addi s7, t1, 256 * 6
+        addi s8, t1, 256 * 7
+
+        vle64.v v4, (t0)
+        vle64.v v5, (a2)
+        vle64.v v6, (a3)
+        vle64.v v7, (a4)
+
+        vle64.v v16, (a5)
+        vle64.v v17, (a6)
+        vle64.v v18, (a7)
+        vle64.v v19, (s9)
+
+        vle64.v v12, (t1)
+        vle64.v v13, (s2)
+        vle64.v v14, (s3)
+        vle64.v v15, (s4)
+
+        vle64.v v20, (s5)
+        vle64.v v21, (s6)
+        vle64.v v22, (s7)
+        vle64.v v23, (s8)
+.endm
+
+.macro reload16 offset
+        li t0, 2048
+        add t0, sp, t0
+        addi t0, t0, \offset
+        addi t1, t0, 2*8*1
+        addi t2, t0, 2*8*2
+        addi t3, t0, 2*8*3
+
+        vsetivli    zero, 8, e16, m1, ta, ma
+        vle16.v        v28, (t0)
+        vle16.v        v29, (t1)
+        vle16.v        v30, (t2)
+        vle16.v        v31, (t3)
+.endm
+
+.macro scale_store_rvv vlen, shift, step, off1, off2, offset
+        reload16 \offset
+
+        vsetivli        zero, 4, e32, m1, ta, ma
+        butterfly32     v28, v24, v29, v25, v2
+        butterfly32     v30, v26, v31, v27, v3
+        scale           \vlen, v1, v10, v3, v9, v2, v28, v24, v29, v3, v30, v26, v31, \shift
+
+        transpose16_4x4_2 1, 10, 3, 9, 24, 25, 26, 27, 28, 29
+
+        store16_rvv         v1, v10, v3, v9, \off1, \off2, \step
+.endm
+
+.macro store_to_stack_rvv off1, off2, in0, in2, in4, in6, in7, in5, in3, in1
+.if \off1 < 2048
+        addi     a2, sp, \off1
+.else
+        li      t0, \off1
+        add     a2, sp, t0
+.endif
+
+.if \off2 < 2048
+        addi     a3, sp, \off2
+.else
+        li      t0, \off2
+        add     a3, sp, t0
+.endif
+
+        addi             a4, a2, 16 * 1
+        addi             a5, a3, -16 * 1
+        addi             a6, a2, 16 * 2
+        addi             a7, a3, -16 * 2
+        addi             s2, a2, 16 * 3
+        addi             s3, a3, -16 * 3
+
+        vse32.v         \in0, (a2)
+        vse32.v         \in1, (a3)
+        vse32.v         \in2, (a4)
+        vse32.v         \in3, (a5)
+        vse32.v         \in4, (a6)
+        vse32.v         \in5, (a7)
+        vse32.v         \in6, (s2)
+        vse32.v         \in7, (s3)
+.endm
+
+.macro transpose16_4x4_2 r0, r1, r2, r3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
+        vsetivli    zero, 8, e16, m1, ta, ma
+        vid.v     v0
+        vand.vi   v8, v0, 1
+        vmsne.vi  v0, v8, 0
+
+        vslideup.vi    v8, v\r1, 1
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vmerge.vvm       v\tmp0\(), v\r0\(), v8, v0
+
+        vslidedown.vi    v8, v\r0, 1
+        vmerge.vvm       v\tmp1\(), v8, v\r1\(), v0
+
+        vslideup.vi    v8, v\r3, 1
+        vmerge.vvm       v\tmp2\(), v\r2\(), v8, v0
+
+        vslidedown.vi    v8, v\r2\(), 1
+        vmerge.vvm       v\tmp3\(), v8, v\r3\(), v0
+
+        vsetivli    zero, 2, e32, m1, ta, ma
+
+        vslideup.vi    v8, v\tmp2\(), 1
+        vmerge.vvm       v\tmp4\(), v\tmp0\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp0\(), 1
+        vmerge.vvm       v\tmp5\(), v8, v\tmp2\(), v0
+
+        vslideup.vi    v8, v\tmp3\(), 1
+        vmerge.vvm       v\tmp0\(), v\tmp1\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp1\(), 1
+        vmerge.vvm       v\tmp2\(), v8, v\tmp3\(), v0
+
+        vsetivli    zero, 1, e64, m1, ta, ma
+        vmv.v.v  v\r0\(), v\tmp4\()
+        vmv.v.v  v\r2\(), v\tmp5\()
+        vmv.v.v  v\r1\(), v\tmp0\()
+        vmv.v.v  v\r3\(), v\tmp2\()
+
+        vsetivli    zero, 8, e16, m1, ta, ma
+
+        vslideup.vi    v8, v\r2\(), 1
+        vmerge.vvm       v\tmp0\(), v\r3\(), v8, v0
+
+        vslidedown.vi    v8, v\r3\(), 1
+        vmerge.vvm        v\tmp1\(), v8, v\r2\(), v0
+
+        vslideup.vi    v8, v\r0\(), 1
+        vmerge.vvm       v\tmp2\(), v\r1\(),v8, v0
+
+        vslidedown.vi    v8, v\r1\(), 1
+        vmerge.vvm        v\tmp3\(), v8, v\r0\(), v0
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+
+        vslideup.vi    v8, v\tmp2\(), 1
+        vmerge.vvm       v\tmp4\(), v\tmp0\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp0\(), 1
+        vmerge.vvm       v\tmp5\(), v8, v\tmp2\(), v0
+
+        vslideup.vi    v8, v\tmp3\(), 1
+        vmerge.vvm       v\tmp0\(), v\tmp1\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp1\(), 1
+        vmerge.vvm       v\tmp2\(), v8, v\tmp3\(), v0
+
+        vsetivli    zero, 2, e64, m1, ta, ma
+
+        vmerge.vvm       v\r3\(), v\r3\(), v\tmp4\(), v0
+        vmerge.vvm       v\r1\(), v\r1\(), v\tmp5\(), v0
+        vmerge.vvm       v\r2\(), v\r2\(), v\tmp0\(), v0
+        vmerge.vvm       v\r0\(), v\r0\(), v\tmp2\(), v0
+.endm
+
+.macro load_trans_8x4
+        li s6, 89
+        li s7, 75
+        li s8, 50
+        li s9, 18
+
+        li s2, -89
+        li s4, -50
+        li s5, -18
+.endm
+
+.macro scale vlen, out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
+.if \vlen > 128
+        vsetivli        zero, 4, e64, m1, ta, ma
+        vslideup.vi     \in0\(), \in1\(), 2
+        vslideup.vi     \in2\(), \in3\(), 2
+        vslideup.vi     \in4\(), \in5\(), 2
+        vslideup.vi     \in6\(), \in7\(), 2
+
+        vsetivli        zero, 8, e16, mf2, ta, ma
+        vnclip.wi       \out0\(), \in0\(), \shift
+        vnclip.wi       \out1\(), \in2\(), \shift
+        vnclip.wi       \out2\(), \in4\(), \shift
+        vnclip.wi       \out3\(), \in6\(), \shift
+.else
+        vsetivli        zero, 4, e16, mf2, ta, ma
+        vnclip.wi       \out0\(), \in0\(), \shift
+        vnclip.wi       \out1\(), \in2\(), \shift
+        vnclip.wi       \out2\(), \in4\(), \shift
+        vnclip.wi       \out3\(), \in6\(), \shift
+
+        vnclip.wi       \in1\(), \in1\(), \shift
+        vnclip.wi       \in3\(), \in3\(), \shift
+        vnclip.wi       \in5\(), \in5\(), \shift
+        vnclip.wi       \in7\(), \in7\(), \shift
+
+        vsetivli        zero, 2, e64, m1, ta, ma
+        vslideup.vi     \out0\(), \in1\(), 1
+        vslideup.vi     \out1\(), \in3\(), 1
+        vslideup.vi     \out2\(), \in5\(), 1
+        vslideup.vi     \out3\(), \in7\(), 1
+.endif
+.endm
+
+.macro load_trans_4x4
+        li s2, 64
+        li s3, 83
+
+        li s5, 36
+        li s6, -64
+        li s7, -83
+.endm
+
+.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3
+        vwcvt.x.x.v     v8, \in0
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vsll.vi         v28, v8, 6
+        vmv.v.v v29, v28
+
+        load_trans_4x4
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        vwmul.vx    v30, \in1, s3
+        vwmul.vx    v31, \in1, s5
+        vwmacc.vx    v28, s2, \in2
+
+        vwmacc.vx    v29, s6, \in2
+        vwmacc.vx    v30, s5, \in3
+        vwmacc.vx    v31, s7, \in3
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        vadd.vv             \out0, v28, v30
+        vadd.vv             \out1, v29, v31
+        vsub.vv             \out2, v29, v31
+        vsub.vv             \out3, v28, v30
+.endm
+
+.macro tr16_8x4 in0, in1, in2, in3, offset, in4, in5, in6, in7
+        tr_4x4_8        \in0, \in1, \in2, \in3, v24, v25, v26, v27
+        load_trans_8x4
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        vwmul.vx    v28, \in4, s6
+        vwmul.vx    v29, \in4, s7
+        vwmul.vx    v30, \in4, s8
+        vwmul.vx    v31, \in4, s9
+
+        vwmacc.vx    v28, s7, \in5
+        vwmacc.vx    v29, s5, \in5
+        vwmacc.vx    v30, s2, \in5
+        vwmacc.vx    v31, s4, \in5
+
+        vwmacc.vx    v28, s8, \in6
+        vwmacc.vx    v29, s2, \in6
+        vwmacc.vx    v30, s9, \in6
+        vwmacc.vx    v31, s7, \in6
+
+        vwmacc.vx    v28, s9, \in7
+        vwmacc.vx    v29, s4, \in7
+        vwmacc.vx    v30, s7, \in7
+        vwmacc.vx    v31, s2, \in7
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        butterfly       v24, v28, v16, v23
+        butterfly       v25, v29, v17, v22
+        butterfly       v26, v30, v18, v21
+        butterfly       v27, v31, v19, v20
+
+.if \offset < 2048
+        addi t0, sp, \offset
+.else
+        li t0, \offset
+        add t0, sp, t0
+.endif
+        addi s2, t0, 16 * 1
+        addi s3, t0, 16 * 2
+        addi s4, t0, 16 * 3
+
+        addi s5, t0, 16 * 4
+        addi s6, t0, 16 * 5
+        addi s7, t0, 16 * 6
+        addi s8, t0, 16 * 7
+
+        vse32.v v16, (t0)
+        vse32.v v17, (s2)
+        vse32.v v18, (s3)
+        vse32.v v19, (s4)
+
+        vse32.v v20, (s5)
+        vse32.v v21, (s6)
+        vse32.v v22, (s7)
+        vse32.v v23, (s8)
+.endm
+
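+/*
+ * Coefficients of the odd half of the HEVC 16-point inverse transform
+ * (90, 87, 80, 70, 57, 43, 25, 9 and their negations).
+ */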
+.macro load_trans_16x4
+        li x12, 90
+        li x13, 87
+        li x14, 80
+        li x15, 70
+
+        li x16, 57
+        li x17, 43
+        li x18, 25
+        li x19, 9
+
+        li x20, -90
+        li x21, -87
+        li x22, -80
+        li x23, -70
+
+        li x24, -57
+        li x25, -43
+        li x26, -25
+        li x27, -9
+.endm
+
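+/*
+ * 16-point inverse transform on four columns: tr16_8x4 produces the even
+ * half and spills it to the stack, the odd half is accumulated with
+ * vwmul.vx/add_member, and butterfly16 combines the two before
+ * store_to_stack_rvv writes the results out.  load16_rvv, add_member,
+ * butterfly16 and store_to_stack_rvv are defined earlier in this file.
+ */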
+.macro tr_16x4_rvv name, shift, offset, step
+func func_tr_16x4_\name\()_rvv, zve64x
+        vsetivli    zero, 4, e16, m1, ta, ma
+        load16_rvv     v16, v17, v18, v19, 0, \step * 64, \step * 128, v0, v1, v2, v3,
+
+        tr16_8x4        v16, v17, v18, v19, \offset, v0, v1, v2, v3,
+
+        vsetivli    zero, 4, e16, m1, ta, ma
+        load16_rvv          v20, v17, v18, v19, \step * 32, \step * 3 * 32, \step * 128, v3, v0, v1, v2,
+
+        load_trans_16x4
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+
+        vwmul.vx            v21, v20, x12
+        vwmul.vx            v22, v20, x13
+        vwmul.vx            v23, v20, x14
+        vwmul.vx            v24, v20, x15
+
+        vwmul.vx            v25, v20, x16
+        vwmul.vx            v26, v20, x17
+        vwmul.vx            v27, v20, x18
+        vwmul.vx            v28, v20, x19
+
+        add_member      v3, x13, x16, x19, x25, x22, x20, x23, x26
+        add_member      v17, x14, x19, x23, x21, x26, x16, x12, x17
+        add_member      v0, x15, x25, x21, x19, x12, x18, x22, x24
+        add_member      v18, x16, x22, x26, x12, x27, x21, x17, x15
+        add_member      v1, x17, x20, x16, x18, x21, x15, x19, x22
+        add_member      v19, x18, x23, x12, x22, x17, x19, x24, x13
+        add_member      v2, x19, x26, x17, x24, x15, x22, x13, x20
+
+.if \offset < 2048
+        addi t0, sp, \offset
+.else
+        li t0, \offset
+        add t0, sp, t0
+.endif
+        addi s2, t0, 16
+        addi s3, t0, 16*2
+        addi s4, t0, 16*3
+        vle32.v        v16, (t0)
+        vle32.v        v17, (s2)
+        vle32.v        v18, (s3)
+        vle32.v        v19, (s4)
+
+        vsetivli    zero, 4, e32, m1, ta, ma
+        butterfly16       v16, v21, v17, v22, v18, v23, v19, v24
+        store_to_stack_rvv  \offset, (\offset + 240), v20, v21, v22, v23, v19, v18, v17, v16
+
+.if \offset < 2048 - 64
+        addi t0, sp, \offset + 64
+.else
+        li t0, \offset + 64
+        add t0, sp, t0
+.endif
+        addi s2, t0, 16
+        addi s3, t0, 16*2
+        addi s4, t0, 16*3
+
+        vle32.v         v16, (t0)
+        vle32.v         v17, (s2)
+        vle32.v         v18, (s3)
+        vle32.v         v19, (s4)
+
+        butterfly16     v16, v25, v17, v26, v18, v27, v19, v28
+        store_to_stack_rvv  (\offset + 64), (\offset + 176), v20, v25, v26, v27, v19, v18, v17, v16
+        ret
+endfunc
+.endm
+
+tr_16x4_rvv noscale, 0, 2048, 4
+
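+/*
+ * Coefficients of the odd half of the HEVC 32-point inverse transform
+ * (90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4).
+ */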
+.macro load_trans_32x4
+        li x12, 90
+        li x13, 90
+        li x14, 88
+        li x15, 85
+
+        li x16, 82
+        li x17, 78
+        li x18, 73
+        li x19, 67
+
+        li x20, 61
+        li x21, 54
+        li x22, 46
+        li x23, 38
+
+        li x24, 31
+        li x25, 22
+        li x26, 13
+        li x27, 4
+.endm
+
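+/*
+ * 32-point inverse transform on four columns: the odd half is computed in
+ * four blocks of eight rows (tr_block1..tr_block4, defined earlier); each
+ * block is then combined with the even half spilled on the stack, scaled
+ * and stored by scale_store_rvv.
+ */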
+.macro tr_32x4_rvv name, shift, vlen
+func func_tr_32x4_\name\()_rvv_\vlen\(), zve64x
+        vsetivli    zero, 1, e64, m1, ta, ma
+        load32_rvv
+
+        load_trans_32x4
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+
+        tr_block1
+        scale_store_rvv         \vlen, \shift, 64, 0, (56 + 3 * 64), 0
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        tr_block2
+        scale_store_rvv          \vlen, \shift, 64, 8, (48 + 3 * 64), 64
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        tr_block3
+        scale_store_rvv          \vlen, \shift, 64, 16, (40 + 3 * 64), 128
+
+        vsetivli    zero, 4, e16, mf2, ta, ma
+        tr_block4
+        scale_store_rvv          \vlen, \shift, 64, 24, (32 + 3 * 64), 192
+
+        ret
+endfunc
+.endm
+
+tr_32x4_rvv firstpass, 7, 128
+tr_32x4_rvv secondpass_8, 20 - 8, 128
+
+tr_32x4_rvv firstpass, 7, 256
+tr_32x4_rvv secondpass_8, 20 - 8, 256
+
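+/*
+ * XLEN-sized load/store of a general-purpose register, used below to save
+ * and restore ra and the callee-saved s registers.
+ */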
+.macro  lx rd, addr
+#if (__riscv_xlen == 32)
+        lw      \rd, \addr
+#elif (__riscv_xlen == 64)
+        ld      \rd, \addr
+#else
+        lq      \rd, \addr
+#endif
+.endm
+
+.macro  sx rd, addr
+#if (__riscv_xlen == 32)
+        sw      \rd, \addr
+#elif (__riscv_xlen == 64)
+        sd      \rd, \addr
+#else
+        sq      \rd, \addr
+#endif
+.endm
+
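+/*
+ * Full 32x32 IDCT: save ra and s0-s11, select the fixed-point rounding mode
+ * used by vnclip (csrwi vxrm), reserve 2432 bytes of scratch, then run the
+ * first and second passes in eight groups of four columns each, combining
+ * the 16-point even half (func_tr_16x4_noscale_rvv) with the 32-point odd
+ * half (func_tr_32x4_*_rvv).
+ */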
+.macro idct_32x32 bitdepth, vlen
+func ff_hevc_idct_32x32_\bitdepth\()_rvv_\vlen\(), zve64x
+
+        addi    sp, sp, -(__riscv_xlen / 8)*13
+        sx  ra, (__riscv_xlen / 8)*(12)(sp)
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+        sx    s\i, (__riscv_xlen / 8)*(11-\i)(sp)
+.endr
+        mv t6, a0
+
+        csrwi   vxrm, 1
+        li      t0, 2432
+        sub     sp, sp, t0
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+        addi              a0, t6, 8 * \i
+        addi              a1, sp, 8 * \i * 32
+        jal              func_tr_16x4_noscale_rvv
+        jal              func_tr_32x4_firstpass_rvv_\vlen\()
+.endr
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+        addi             a0, sp, 8 * \i
+        addi             a1, t6, 8 * \i * 32
+        jal              func_tr_16x4_noscale_rvv
+        jal              func_tr_32x4_secondpass_\bitdepth\()_rvv_\vlen\()
+.endr
+
+        li              t0, 2432
+        add             sp,  sp,  t0
+
+        lx  ra, (__riscv_xlen / 8)*(12)(sp)
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+        lx    s\i, (__riscv_xlen / 8)*(11-\i)(sp)
+.endr
+        addi    sp, sp, (__riscv_xlen / 8)*13
+        ret
+endfunc
+.endm
+
+idct_32x32 8, 128
+idct_32x32 8, 256
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 1d8326a573..d567c8b433 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -27,6 +27,9 @@
 #include "libavcodec/hevc/dsp.h"
 #include "libavcodec/riscv/h26x/h2656dsp.h"
 
+void ff_hevc_idct_32x32_8_rvv_128(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_8_rvv_256(int16_t *coeffs, int col_limit);
+
 #define RVV_FNASSIGN(member, v, h, fn, ext) \
         member[1][v][h] = ff_h2656_put_pixels_##8_##ext;  \
         member[3][v][h] = ff_h2656_put_pixels_##8_##ext;  \
@@ -40,27 +43,47 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
     const int flags = av_get_cpu_flags();
     int vlenb;
 
-    if (!(flags & AV_CPU_FLAG_RVV_I32) || !(flags & AV_CPU_FLAG_RVB))
-        return;
-
     vlenb = ff_get_rv_vlenb();
-    if (vlenb >= 32) {
-        switch (bit_depth) {
-            case 8:
-                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
-                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
-                break;
-            default:
-                break;
+
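+    /* The 32x32 IDCT assembly uses 64-bit vector elements (zve64x), so it is
+     * gated on RVV_I64 only; the qpel/epel assignments below still require
+     * RVV_I32 and RVB. */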
+    if (flags & AV_CPU_FLAG_RVV_I64){
+        if (vlenb >= 32) {
+            switch (bit_depth) {
+                case 8:
+                    c->idct[3]                     = ff_hevc_idct_32x32_8_rvv_256;
+                    break;
+                default:
+                    break;
+            }
+        } else if (vlenb >= 16){
+            switch (bit_depth) {
+                case 8:
+                    c->idct[3]                     = ff_hevc_idct_32x32_8_rvv_128;
+                    break;
+                default:
+                    break;
+            }
         }
-    } else if (vlenb >= 16) {
-        switch (bit_depth) {
-            case 8:
-                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
-                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
-                break;
-            default:
-                break;
+    }
+
+    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)){
+        if (vlenb >= 32) {
+            switch (bit_depth) {
+                case 8:
+                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
+                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
+                    break;
+                default:
+                    break;
+            }
+        } else if (vlenb >= 16) {
+            switch (bit_depth) {
+                case 8:
+                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
+                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
+                    break;
+                default:
+                    break;
+            }
         }
     }
 #endif
-- 
2.25.1
