Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH] libavcodec/riscv:add RVV optimized for idct_32x32_8:
@ 2025-04-15  7:34 daichengrong
  2025-04-15 15:02 ` Rémi Denis-Courmont
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: daichengrong @ 2025-04-15  7:34 UTC (permalink / raw)
  To: ffmpeg-devel

From: daichengrong <daichengrong@iscas.ac.cn>

     riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8

     On Banana PI F3:

     hevc_idct_32x32_8_c:                                119579.3 ( 1.00x)
     hevc_idct_32x32_8_rvv_i64:                           51254.4 ( 2.33x)

Signed-off-by: daichengrong <daichengrong@iscas.ac.cn>
---
 libavcodec/riscv/Makefile           |    1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 1042 +++++++++++++++++++++++++++
 libavcodec/riscv/hevcdsp_init.c     |   52 +-
 3 files changed, 1075 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 0000000000..f8dd2e5bf4
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,1042 @@
+/*
+ * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+const trans, align=4
+        .2byte          64, 83, 64, 36
+        .2byte          89, 75, 50, 18
+        .2byte          90, 87, 80, 70
+        .2byte          57, 43, 25, 9
+        .2byte          90, 90, 88, 85
+        .2byte          82, 78, 73, 67
+        .2byte          61, 54, 46, 38
+        .2byte          31, 22, 13, 4
+endconst
+
+.macro sum_sub out, in, c, op, p       // \out \op= \c * \in (widening); \p == 2 uses elements 4..7 of \in
+        vsetivli	t0, 4, e16, mf2, tu, ma
+  .ifc \op, +
+        .ifc \p, 2  
+                vslidedown.vi	v8, \in, 4
+                vwmacc.vx	\out, \c, v8
+        .else
+                vwmacc.vx	\out, \c, \in
+        .endif
+  .else
+        .ifc \p, 2  
+                neg	\c, \c
+                vslidedown.vi	v8, \in, 4
+                vwmacc.vx	\out, \c, v8
+                neg	\c, \c
+        .else
+                neg	\c, \c
+                vwmacc.vx	\out, \c, \in
+                neg	\c, \c
+        .endif
+  .endif
+.endm
+
+.macro add_member32 in, t0, index0, t1, index1, t2, index2, t3, index3, op0, op1, op2, op3, p // accumulate \in into v24..v27 with coefficients taken from \t0..\t3
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vslidedown.vi	v12, \t0, \index0
+        vmv.x.s	s2, v12
+        vslidedown.vi	v12, \t1, \index1
+        vmv.x.s	s3, v12
+        vslidedown.vi	v12, \t2, \index2
+        vmv.x.s	s4, v12
+        vslidedown.vi	v12, \t3, \index3
+        vmv.x.s	s5, v12
+
+        sum_sub         v24, \in, s2, \op0, \p
+        sum_sub         v25, \in, s3, \op1, \p
+        sum_sub         v26, \in, s4, \op2, \p
+        sum_sub         v27, \in, s5, \op3, \p
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m
+        vsetivli             t0, 4, e32, m1, tu, ma
+        vadd.vv             \tmp_p, \e, \o
+        vsub.vv             \tmp_m, \e, \o
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+        vsetivli             t0, 4, e32, m1, tu, ma
+        vadd.vv              v20, \in0, \in1
+        vsub.vv              \in0, \in0, \in1
+        vadd.vv              \in1, \in2, \in3
+        vsub.vv              \in2, \in2, \in3
+        vadd.vv              \in3, \in4, \in5
+        vsub.vv              \in4, \in4, \in5
+        vadd.vv              \in5, \in6, \in7
+        vsub.vv              \in6, \in6, \in7
+.endm
+
+.macro multiply in                     // v24..v27 = v4 * \in[0..3] (widening multiply by scalar)
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vmv.x.s	s2, \in
+        vslidedown.vi	v12, \in, 1
+        vmv.x.s	s3, v12
+        vslidedown.vi	v12, \in, 2
+        vmv.x.s	s4, v12
+        vslidedown.vi	v12, \in, 3
+        vmv.x.s	s5, v12
+
+        vsetivli        t0, 4, e16, mf2, tu, ma
+        vwmul.vx        v24, v4, s2
+        vwmul.vx        v25, v4, s3
+        vwmul.vx        v26, v4, s4
+        vwmul.vx        v27, v4, s5
+.endm
+
+func tr_block1, zve64x
+        multiply        v0
+        add_member32    v4,  v0, 1, v1, 0, v1, 3, v2, 2, +, +, +, +, 2
+        add_member32    v5,  v0, 2, v1, 3, v3, 0, v3, 2, +, +, +, -
+        add_member32    v5,  v0, 3, v2, 2, v3, 2, v1, 3, +, +, -, -, 2
+        add_member32    v6,  v1, 0, v3, 1, v2, 1, v0, 0, +, +, -, -
+        add_member32    v6,  v1, 1, v3, 3, v1, 0, v1, 2, +, -, -, -, 2
+        add_member32    v7,  v1, 2, v3, 0, v0, 0, v3, 1, +, -, -, -
+        add_member32    v7,  v1, 3, v2, 1, v1, 1, v2, 3, +, -, -, +, 2
+        add_member32    v16, v2, 0, v1, 2, v2, 2, v1, 0, +, -, -, +
+        add_member32    v16, v2, 1, v0, 3, v3, 3, v0, 2, +, -, -, +, 2
+        add_member32    v17, v2, 2, v0, 1, v2, 3, v2, 1, +, -, +, +
+        add_member32    v17, v2, 3, v0, 2, v1, 2, v3, 3, +, -, +, -, 2
+        add_member32    v18, v3, 0, v1, 1, v0, 1, v2, 0, +, -, +, -
+        add_member32    v18, v3, 1, v2, 0, v0, 3, v0, 1, +, -, +, -, 2
+        add_member32    v19, v3, 2, v2, 3, v2, 0, v1, 1, +, -, +, -
+        add_member32    v19, v3, 3, v3, 2, v3, 1, v3, 0, +, -, +, -, 2
+        ret
+endfunc
+
+func tr_block2, zve64x
+        multiply        v1
+        add_member32    v4,  v3, 1, v3, 3, v3, 0, v2, 1, +, -, -, -, 2
+        add_member32    v5,  v2, 1, v1, 0, v0, 0, v1, 1, -, -, -, -
+        add_member32    v5,  v0, 0, v1, 2, v3, 1, v2, 3, -, -, -, +, 2
+        add_member32    v6,  v2, 0, v3, 2, v1, 1, v0, 3, -, +, +, +
+        add_member32    v6,  v3, 2, v0, 3, v1, 3, v3, 1, +, +, +, -, 2
+        add_member32    v7,  v1, 1, v1, 3, v2, 3, v0, 0, +, +, -, -
+        add_member32    v7,  v0, 3, v3, 1, v0, 1, v3, 3, +, -, -, +, 2
+        add_member32    v16, v3, 0, v0, 2, v3, 2, v0, 1, +, -, -, +
+        add_member32    v16, v2, 2, v2, 0, v1, 0, v3, 2, -, -, +, +, 2
+        add_member32    v17, v0, 1, v3, 0, v2, 0, v0, 2, -, +, +, -
+        add_member32    v17, v1, 3, v0, 1, v2, 2, v3, 0, -, +, -, -, 2
+        add_member32    v18, v3, 3, v2, 1, v0, 2, v1, 0, +, +, -, +
+        add_member32    v18, v1, 2, v2, 3, v3, 3, v2, 2, +, -, -, +, 2
+        add_member32    v19, v0, 2, v0, 1, v0, 3, v1, 2, +, -, +, -
+        add_member32    v19, v2, 3, v2, 2, v2, 1, v2, 0, +, -, +, -, 2
+        ret
+endfunc
+
+func tr_block3, zve64x
+        multiply        v2
+        add_member32    v4,  v1, 2, v0, 3, v0, 0, v0, 2, -, -, -, -, 2
+        add_member32    v5,  v2, 2, v3, 3, v2, 3, v1, 2, -, -, +, +
+        add_member32    v5,  v1, 0, v0, 2, v2, 1, v3, 3, +, +, +, -, 2
+        add_member32    v6,  v3, 0, v2, 2, v0, 1, v1, 3, +, -, -, -
+        add_member32    v6,  v0, 2, v2, 0, v3, 0, v0, 0, -, -, +, +, 2
+        add_member32    v7,  v3, 2, v1, 0, v2, 0, v2, 2, -, +, +, -
+        add_member32    v7,  v0, 0, v3, 2, v0, 2, v3, 0, +, +, -, -, 2
+        add_member32    v16, v3, 3, v0, 1, v3, 1, v0, 3, -, -, +, +
+        add_member32    v16, v0, 1, v2, 3, v1, 3, v1, 1, -, +, +, -, 2
+        add_member32    v17, v3, 1, v1, 3, v0, 3, v3, 2, +, +, -, +
+        add_member32    v17, v0, 3, v1, 1, v3, 2, v2, 0, +, -, +, +, 2
+        add_member32    v18, v2, 3, v3, 1, v1, 2, v0, 1, -, -, +, -
+        add_member32    v18, v1, 1, v0, 0, v1, 0, v2, 1, -, +, -, +, 2
+        add_member32    v19, v2, 1, v3, 0, v3, 3, v3, 1, +, -, +, +
+        add_member32    v19, v1, 3, v1, 2, v1, 1, v1, 0, +, -, +, -, 2
+        ret
+endfunc
+
+func tr_block4, zve64x
+        multiply        v3
+        add_member32    v4,  v1, 1, v2, 0, v2, 3, v3, 2, -, -, -, -, 2
+        add_member32    v5,  v0, 0, v0, 3, v2, 0, v3, 1, +, +, +, +
+        add_member32    v5,  v2, 0, v0, 0, v1, 1, v3, 0, -, -, -, -, 2
+        add_member32    v6,  v3, 3, v1, 2, v0, 2, v2, 3, +, +, +, +
+        add_member32    v6,  v2, 1, v2, 3, v0, 0, v2, 2, +, -, -, -, 2
+        add_member32    v7,  v0, 2, v3, 3, v0, 3, v2, 1, -, -, +, +
+        add_member32    v7,  v1, 0, v2, 2, v1, 2, v2, 0, +, +, -, -, 2
+        add_member32    v16, v2, 3, v1, 1, v2, 1, v1, 3, -, -, +, +
+        add_member32    v16, v3, 1, v0, 1, v3, 0, v1, 2, -, +, -, -, 2
+        add_member32    v17, v1, 2, v1, 0, v3, 3, v1, 1, +, -, +, +
+        add_member32    v17, v0, 1, v2, 1, v3, 1, v1, 0, -, +, +, -, 2
+        add_member32    v18, v1, 3, v3, 2, v2, 2, v0, 3, +, -, -, +
+        add_member32    v18, v3, 2, v3, 0, v1, 3, v0, 2, -, -, +, -, 2
+        add_member32    v19, v2, 2, v1, 3, v1, 0, v0, 1, -, +, -, +
+        add_member32    v19, v0, 3, v0, 2, v0, 1, v0, 0, +, -, +, -, 2
+        ret
+endfunc
+
+.macro butterfly32 in0, in1, in2, in3, out // \out = \in0 + \in1, remaining sums/differences written in place
+        vsetivli              t0, 4, e32, m1, tu, ma
+        vadd.vv             \out, \in0, \in1
+        vsub.vv             \in0, \in0, \in1
+        vadd.vv             \in1, \in2, \in3
+        vsub.vv             \in2, \in2, \in3
+.endm
+
+.macro load16 in0, in1, in2, in3       // four rows: low 64 bits from (a1), high 64 bits from (a3), stride a2
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v         v8, (a3)
+        
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx    \in0, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v         \in0, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+     
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx \in1, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v \in1, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+        
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx \in2, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v \in2, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+        
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx \in3, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v \in3, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+.endm
+
+.macro store16 in0, in1, in2, in3, rx  // four rows: low 64 bits to (a1) (stride a2), high 64 bits to (a3) (stride \rx)
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vse64.v \in0, (a1)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1down.vx v8, \in0, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vse64.v v8, (a3)
+        add a1, a1, a2
+        add a3, a3, \rx
+
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vse64.v \in1, (a1)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1down.vx v8, \in1, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vse64.v v8, (a3)
+        add a1, a1, a2
+        add a3, a3, \rx
+        
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vse64.v \in2, (a1)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1down.vx v8, \in2, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vse64.v v8, (a3)
+        add a1, a1, a2
+        add a3, a3, \rx
+        
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vse64.v \in3, (a1)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1down.vx v8, \in3, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vse64.v v8, (a3)
+        add a1, a1, a2
+        add a3, a3, \rx
+       
+.endm
+
+.macro load32                          // v4..v7 and v16..v19: low 64 bits from (a1), high from (a3), stride 256
+        addi             a1,  a5, 64
+        addi             a3,  a1, 128
+        li             a2,  256
+        
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx v4, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v4, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx v5, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v5, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx v6, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v6, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx v7, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v7, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx v16, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v16, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx v17, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v17, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx v18, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v18, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli        t0, 2, e64, m1, tu, ma
+        vslide1up.vx v19, v8, zero
+        vsetivli        t0, 1, e64, m1, tu, ma
+        vle64.v v19, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+.endm
+
+
+
+.macro add_member in, tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7, op0, op1, op2, op3, op4, op5, op6, op7, p // accumulate \in into v21..v28 with weights \tt0..\tt7
+        sum_sub         v21, \in, \tt0, \op0, \p
+        sum_sub         v22, \in, \tt1, \op1, \p
+        sum_sub         v23, \in, \tt2, \op2, \p
+        sum_sub         v24, \in, \tt3, \op3, \p
+        sum_sub         v25, \in, \tt4, \op4, \p
+        sum_sub         v26, \in, \tt5, \op5, \p
+        sum_sub         v27, \in, \tt6, \op6, \p
+        sum_sub         v28, \in, \tt7, \op7, \p
+.endm
+
+.macro scale_store shift               // combine saved rows at (a4) with v24..v27, scale by \shift, transpose, store
+        vsetivli        t0, 8, e16, m1, tu, ma 
+        vle16.v        v28, (a4)
+        addi            a4, a4, 2*8
+        vle16.v        v29, (a4)
+        addi            a4, a4, 2*8
+        vle16.v        v30, (a4)
+        addi            a4, a4, 2*8
+        vle16.v        v31, (a4)
+        addi            a4, a4, 2*8
+
+        butterfly32     v28, v24, v29, v25, v2
+        butterfly32     v30, v26, v31, v27, v3
+
+        scale           v20, v21, v22, v23, v2, v28, v24, v29, v3, v30, v26, v31, \shift
+
+        transpose16_4x4_2 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        store16         v20, v21, v22, v23, t1
+
+        vsetivli        t0, 4, e16, m1, tu, ma 
+        vle16.v         v2, (t2)
+        addi            t2, t2, 8
+        vle16.v         v3, (t2)
+        addi            t2, t2, -8
+.endm
+
+.macro store_to_stack off1, off2, in0, in2, in4, in6, in7, in5, in3, in1 // \in0/\in2/\in4/\in6 go upwards from sp+\off1, the rest downwards from sp+\off2
+        li a7, \off1
+        add a1, sp, a7
+        li a7, \off2
+        add a3, sp, a7  
+        li             a2, -16
+        li             a4, 16
+
+        vsetivli        t0, 4, e32, m1, tu, ma
+        vse32.v \in0, (a1)
+        add a1, a1, a4
+        vse32.v \in1, (a3)
+        add a3, a3, a2
+        vse32.v \in2, (a1)
+        add a1, a1, a4
+        vse32.v \in3, (a3)
+        add a3, a3, a2
+        vse32.v \in4, (a1)
+        add a1, a1, a4
+        vse32.v \in5, (a3)
+        add a3, a3, a2
+        vse32.v \in6, (a1)
+        vse32.v \in7, (a3)
+.endm
+
+.macro transpose16_4x4_2 r0, r1, r2, r3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
+        // lower halves
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vmv.v.v	\tmp0\(), \r0\()
+        vsetivli	t0, 2, e16, m1, tu, ma
+        vslideup.vi	\tmp0\(), \r1\(), 1
+        vslidedown.vi	v8, \r0\(), 2
+        vsetivli	t0, 3, e16, m1, tu, ma
+        vslideup.vi	\tmp0\(), v8, 2
+        vslidedown.vi	v8, \r1\(), 2
+        vsetivli	t0, 4, e16, m1, tu, ma
+        vslideup.vi	\tmp0\(), v8, 3
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vslidedown.vi	v8, \r0\(), 1
+        vmv.v.v	\tmp1\(), v8
+        vslidedown.vi	v8, \r1\(), 1
+        vsetivli	t0, 2, e16, m1, tu, ma
+        vslideup.vi	\tmp1\(), v8, 1
+        vslidedown.vi	v8, \r0\(), 3
+        vsetivli	t0, 3, e16, m1, tu, ma
+        vslideup.vi	\tmp1\(), v8, 2
+        vslidedown.vi	v8, \r1\(), 3
+        vsetivli	t0, 4, e16, m1, tu, ma
+        vslideup.vi	\tmp1\(), v8, 3
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vmv.v.v	\tmp2\(), \r2\()
+        vsetivli	t0, 2, e16, m1, tu, ma
+        vslideup.vi	\tmp2\(), \r3\(), 1
+        vslidedown.vi	v8, \r2\(), 2
+        vsetivli	t0, 3, e16, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 2
+        vslidedown.vi	v8, \r3\(), 2
+        vsetivli	t0, 4, e16, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 3
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vslidedown.vi	v8, \r2\(), 1
+        vmv.v.v	\tmp3\(), v8
+        vslidedown.vi	v8, \r3\(), 1
+        vsetivli	t0, 2, e16, m1, tu, ma
+        vslideup.vi	\tmp3\(), v8, 1
+        vslidedown.vi	v8, \r2\(), 3
+        vsetivli	t0, 3, e16, m1, tu, ma
+        vslideup.vi	\tmp3\(), v8, 2
+        vslidedown.vi	v8, \r3\(), 3
+        vsetivli	t0, 4, e16, m1, tu, ma
+        vslideup.vi	\tmp3\(), v8, 3
+        vsetivli	t0, 1, e32, m1, tu, ma
+        vmv.v.v	\tmp4\(), \tmp0\()
+        vsetivli	t0, 2, e32, m1, tu, ma
+        vslideup.vi	\tmp4\(), \tmp2\(), 1
+        vsetivli	t0, 1, e32, m1, tu, ma
+        vslidedown.vi	v8, \tmp0\(), 1
+        vmv.v.v	\tmp5\(), v8
+        vslidedown.vi	v8, \tmp2\(), 1
+        vsetivli	t0, 2, e32, m1, tu, ma
+        vslideup.vi	\tmp5\(), v8, 1
+        vsetivli	t0, 1, e32, m1, tu, ma
+        vmv.v.v	\tmp0\(), \tmp1\()
+        vsetivli	t0, 2, e32, m1, tu, ma
+        vslideup.vi	\tmp0\(), \tmp3\(), 1
+        vsetivli	t0, 1, e32, m1, tu, ma
+        vslidedown.vi	v8, \tmp1\(), 1
+        vmv.v.v	\tmp2\(), v8
+        vslidedown.vi	v8, \tmp3\(), 1
+        vsetivli	t0, 2, e32, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 1
+
+        vsetivli         t0, 1, e64, m1, tu, ma
+        vmv.v.v          \r0\(), \tmp4\()
+        vmv.v.v          \r2\(), \tmp5\()
+        vmv.v.v          \r1\(), \tmp0\()
+        vmv.v.v          \r3\(), \tmp2\()
+
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vmv.v.v	\tmp0\(), \r3\()
+        vsetivli	t0, 2, e16, m1, tu, ma
+        vslideup.vi	\tmp0\(), \r2\(), 1
+        vslidedown.vi	v8, \r3\(), 2
+        vsetivli	t0, 3, e16, m1, tu, ma
+        vslideup.vi	\tmp0\(), v8, 2
+        vslidedown.vi	v8, \r2\(), 2
+        vsetivli	t0, 4, e16, m1, tu, ma
+        vslideup.vi	\tmp0\(), v8, 3
+        vslidedown.vi	v8, \r3\(), 4
+        vsetivli	t0, 5, e16, m1, tu, ma
+        vslideup.vi	\tmp0\(), v8, 4
+        vslidedown.vi	v8, \r2\(), 4
+        vsetivli	t0, 6, e16, m1, tu, ma
+        vslideup.vi	\tmp0\(), v8, 5
+        vslidedown.vi	v8, \r3\(), 6
+        vsetivli	t0, 7, e16, m1, tu, ma
+        vslideup.vi	\tmp0\(), v8, 6
+        vslidedown.vi	v8, \r2\(), 6
+        vsetivli	t0, 8, e16, m1, tu, ma
+        vslideup.vi	\tmp0\(), v8, 7
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vslidedown.vi	v8, \r3\(), 1
+        vmv.v.v	\tmp1\(), v8
+        vslidedown.vi	v8, \r2\(), 1
+        vsetivli	t0, 2, e16, m1, tu, ma
+        vslideup.vi	\tmp1\(), v8, 1
+        vslidedown.vi	v8, \r3\(), 3
+        vsetivli	t0, 3, e16, m1, tu, ma
+        vslideup.vi	\tmp1\(), v8, 2
+        vslidedown.vi	v8, \r2\(), 3
+        vsetivli	t0, 4, e16, m1, tu, ma
+        vslideup.vi	\tmp1\(), v8, 3
+        vslidedown.vi	v8, \r3\(), 5
+        vsetivli	t0, 5, e16, m1, tu, ma
+        vslideup.vi	\tmp1\(), v8, 4
+        vslidedown.vi	v8, \r2\(), 5
+        vsetivli	t0, 6, e16, m1, tu, ma
+        vslideup.vi	\tmp1\(), v8, 5
+        vslidedown.vi	v8, \r3\(), 7
+        vsetivli	t0, 7, e16, m1, tu, ma
+        vslideup.vi	\tmp1\(), v8, 6
+        vslidedown.vi	v8, \r2\(), 7
+        vsetivli	t0, 8, e16, m1, tu, ma
+        vslideup.vi	\tmp1\(), v8, 7
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vmv.v.v	\tmp2\(), \r1\()
+        vsetivli	t0, 2, e16, m1, tu, ma
+        vslideup.vi	\tmp2\(), \r0\(), 1
+        vslidedown.vi	v8, \r1\(), 2
+        vsetivli	t0, 3, e16, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 2
+        vslidedown.vi	v8, \r0\(), 2
+        vsetivli	t0, 4, e16, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 3
+        vslidedown.vi	v8, \r1\(), 4
+        vsetivli	t0, 5, e16, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 4
+        vslidedown.vi	v8, \r0\(), 4
+        vsetivli	t0, 6, e16, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 5
+        vslidedown.vi	v8, \r1\(), 6
+        vsetivli	t0, 7, e16, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 6
+        vslidedown.vi	v8, \r0\(), 6
+        vsetivli	t0, 8, e16, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 7
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vslidedown.vi	v8, \r1\(), 1
+        vmv.v.v	\tmp3\(), v8
+        vslidedown.vi	v8, \r0\(), 1
+        vsetivli	t0, 2, e16, m1, tu, ma
+        vslideup.vi	\tmp3\(), v8, 1
+        vslidedown.vi	v8, \r1\(), 3
+        vsetivli	t0, 3, e16, m1, tu, ma
+        vslideup.vi	\tmp3\(), v8, 2
+        vslidedown.vi	v8, \r0\(), 3
+        vsetivli	t0, 4, e16, m1, tu, ma
+        vslideup.vi	\tmp3\(), v8, 3
+        vslidedown.vi	v8, \r1\(), 5
+        vsetivli	t0, 5, e16, m1, tu, ma
+        vslideup.vi	\tmp3\(), v8, 4
+        vslidedown.vi	v8, \r0\(), 5
+        vsetivli	t0, 6, e16, m1, tu, ma
+        vslideup.vi	\tmp3\(), v8, 5
+        vslidedown.vi	v8, \r1\(), 7
+        vsetivli	t0, 7, e16, m1, tu, ma
+        vslideup.vi	\tmp3\(), v8, 6
+        vslidedown.vi	v8, \r0\(), 7
+        vsetivli	t0, 8, e16, m1, tu, ma
+        vslideup.vi	\tmp3\(), v8, 7
+        vsetivli	t0, 1, e32, m1, tu, ma
+        vmv.v.v	\tmp4\(), \tmp0\()
+        vsetivli	t0, 2, e32, m1, tu, ma
+        vslideup.vi	\tmp4\(), \tmp2\(), 1
+        vslidedown.vi	v8, \tmp0\(), 2
+        vsetivli	t0, 3, e32, m1, tu, ma
+        vslideup.vi	\tmp4\(), v8, 2
+        vslidedown.vi	v8, \tmp2\(), 2
+        vsetivli	t0, 4, e32, m1, tu, ma
+        vslideup.vi	\tmp4\(), v8, 3
+        vsetivli	t0, 1, e32, m1, tu, ma
+        vslidedown.vi	v8, \tmp0\(), 1
+        vmv.v.v	\tmp5\(), v8
+        vslidedown.vi	v8, \tmp2\(), 1
+        vsetivli	t0, 2, e32, m1, tu, ma
+        vslideup.vi	\tmp5\(), v8, 1
+        vslidedown.vi	v8, \tmp0\(), 3
+        vsetivli	t0, 3, e32, m1, tu, ma
+        vslideup.vi	\tmp5\(), v8, 2
+        vslidedown.vi	v8, \tmp2\(), 3
+        vsetivli	t0, 4, e32, m1, tu, ma
+        vslideup.vi	\tmp5\(), v8, 3
+        vsetivli	t0, 1, e32, m1, tu, ma
+        vmv.v.v	\tmp0\(), \tmp1\()
+        vsetivli	t0, 2, e32, m1, tu, ma
+        vslideup.vi	\tmp0\(), \tmp3\(), 1
+        vslidedown.vi	v8, \tmp1\(), 2
+        vsetivli	t0, 3, e32, m1, tu, ma
+        vslideup.vi	\tmp0\(), v8, 2
+        vslidedown.vi	v8, \tmp3\(), 2
+        vsetivli	t0, 4, e32, m1, tu, ma
+        vslideup.vi	\tmp0\(), v8, 3
+        vsetivli	t0, 1, e32, m1, tu, ma
+        vslidedown.vi	v8, \tmp1\(), 1
+        vmv.v.v	\tmp2\(), v8
+        vslidedown.vi	v8, \tmp3\(), 1
+        vsetivli	t0, 2, e32, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 1
+        vslidedown.vi	v8, \tmp1\(), 3
+        vsetivli	t0, 3, e32, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 2
+        vslidedown.vi	v8, \tmp3\(), 3
+        vsetivli	t0, 4, e32, m1, tu, ma
+        vslideup.vi	\tmp2\(), v8, 3
+
+        vsetivli         t0, 1, e64, m1, tu, ma
+        vslidedown.vi      v8, \tmp4\(), 1
+        vsetivli         t0, 2, e64, m1, tu, ma
+        vslideup.vi       \r3\(), v8, 1 
+
+        vsetivli         t0, 1, e64, m1, tu, ma
+        vslidedown.vi      v8, \tmp5\(), 1
+        vsetivli         t0, 2, e64, m1, tu, ma
+        vslideup.vi       \r1\(), v8, 1 
+
+        vsetivli         t0, 1, e64, m1, tu, ma
+        vslidedown.vi      v8, \tmp0\(), 1
+        vsetivli         t0, 2, e64, m1, tu, ma
+        vslideup.vi       \r2\(), v8, 1 
+
+        vsetivli         t0, 1, e64, m1, tu, ma
+        vslidedown.vi      v8, \tmp2\(), 1
+        vsetivli         t0, 2, e64, m1, tu, ma
+        vslideup.vi       \r0\(), v8, 1 
+.endm
+
+.macro tr16_8x4 in0, in1, in2, in3, offset // 16-point butterfly of four columns, results spilled to sp+\offset
+        tr_4x4_8        \in0, \in1, \in2, \in3, v24, v25, v26, v27
+        
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vmv.x.s	s2, v0
+        vslidedown.vi	v12, v0, 1
+        vmv.x.s	s3, v12
+        vslidedown.vi	v12, v0, 2
+        vmv.x.s	s4, v12
+        vslidedown.vi	v12, v0, 3
+        vmv.x.s	s5, v12
+        vslidedown.vi	v12, v0, 4
+        vmv.x.s	s6, v12
+        vslidedown.vi	v12, v0, 5
+        vmv.x.s	s7, v12
+        vslidedown.vi	v12, v0, 6
+        vmv.x.s	s8, v12
+        vslidedown.vi	v12, v0, 7
+        vmv.x.s	s9, v12
+        
+        vsetivli	t0, 4, e16, mf2, tu, ma
+        vslidedown.vi	v8, \in0, 4
+        vwmul.vx	v28, v8, s6
+        vslidedown.vi	v8, \in0, 4
+        vwmul.vx	v29, v8, s7
+        vslidedown.vi	v8, \in0, 4
+        vwmul.vx	v30, v8, s8
+        vslidedown.vi	v8, \in0, 4
+        vwmul.vx	v31, v8, s9
+
+        sum_sub         v28, \in1, s7, +, 2
+        sum_sub         v29, \in1, s9, -, 2
+        sum_sub         v30, \in1, s6, -, 2
+        sum_sub         v31, \in1, s8, -, 2
+        sum_sub         v28, \in2, s8, +, 2
+        sum_sub         v29, \in2, s6, -, 2
+        sum_sub         v30, \in2, s9, +, 2
+        sum_sub         v31, \in2, s7, +, 2
+        sum_sub         v28, \in3, s9, +, 2
+        sum_sub         v29, \in3, s8, -, 2
+        sum_sub         v30, \in3, s7, +, 2
+        sum_sub         v31, \in3, s6, -, 2
+
+        butterfly       v24, v28, v16, v23
+        butterfly       v25, v29, v17, v22
+        butterfly       v26, v30, v18, v21
+        butterfly       v27, v31, v19, v20
+        
+        li a7, \offset
+        add a4, sp, a7
+
+        vsetivli        t0, 4, e32, m1, tu, ma
+        vse32.v v16, (a4)
+        add a4, a4, 16
+        vse32.v v17, (a4)
+        add a4, a4, 16
+        vse32.v v18, (a4)
+        add a4, a4, 16
+        vse32.v v19, (a4)
+        add a4, a4, 16
+
+        vse32.v v20, (a4)
+        add a4, a4, 16
+        vse32.v v21, (a4)
+        add a4, a4, 16
+        vse32.v v22, (a4)
+        add a4, a4, 16
+        vse32.v v23, (a4)
+        add a4, a4, 16
+
+        add a4, a4, -64
+.endm
+
+.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift // round-shift 32-bit lanes down to 16 bits, packing pairs into \out0..\out3
+        vsetivli	t0, 4, e16, mf2, tu, ma
+        vnclip.wi	\out0\(), \in0\(), \shift
+        vsetivli	t0, 1, e64, m1, tu, ma
+        vmv.x.s	a7, \out0\()
+        vsetivli	t0, 4, e16, mf2, tu, ma
+        vnclip.wi	v8, \in1\(), \shift
+        vsetivli	t0, 2, e64, m1, tu, ma
+        vslide1up.vx	\out0\(), v8, a7
+        vsetivli	t0, 4, e16, mf2, tu, ma
+        vnclip.wi	\out1\(), \in2\(), \shift
+        vsetivli	t0, 1, e64, m1, tu, ma
+        vmv.x.s	a7, \out1\()
+        vsetivli	t0, 4, e16, mf2, tu, ma
+        vnclip.wi	v8, \in3\(), \shift
+        vsetivli	t0, 2, e64, m1, tu, ma
+        vslide1up.vx	\out1\(), v8, a7
+        vsetivli	t0, 4, e16, mf2, tu, ma
+        vnclip.wi	\out2\(), \in4\(), \shift
+        vsetivli	t0, 1, e64, m1, tu, ma
+        vmv.x.s	a7, \out2\()
+        vsetivli	t0, 4, e16, mf2, tu, ma
+        vnclip.wi	v8, \in5\(), \shift
+        vsetivli	t0, 2, e64, m1, tu, ma
+        vslide1up.vx	\out2\(), v8, a7
+        vsetivli	t0, 4, e16, mf2, tu, ma
+        vnclip.wi	\out3\(), \in6\(), \shift
+        vsetivli	t0, 1, e64, m1, tu, ma
+        vmv.x.s	a7, \out3\()
+        vsetivli	t0, 4, e16, mf2, tu, ma
+        vnclip.wi	v8, \in7\(), \shift
+        vsetivli	t0, 2, e64, m1, tu, ma
+        vslide1up.vx	\out3\(), v8, a7
+
+.endm
+
+.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2 // 4-point inverse transform using the first row of trans
+        vsetivli        t0, 4, e16, m1, tu, ma
+        vwcvt.x.x.v     v8, \in0  
+        vsetivli        t0, 4, e32, m1, tu, ma
+        vsll.vi         v28, v8, 6
+
+        vsetivli t0, 16, e8, m1, tu, ma
+        vmv.v.v v29, v28
+
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vmv.x.s	s2, v0
+        vslidedown.vi	v12, v0, 1
+        vmv.x.s	s3, v12
+        vslidedown.vi	v12, v0, 3
+        vmv.x.s	s5, v12
+        vsetivli	t0, 4, e16, mf2, tu, ma
+        vwmul.vx	v30, \in1, s3
+        vwmul.vx	v31, \in1, s5
+        vwmacc.vx	v28, s2, \in2
+        neg	s2, s2
+        vwmacc.vx	v29, s2, \in2
+        neg	s2, s2
+        vwmacc.vx	v30, s5, \in3
+        neg	s3, s3
+        vwmacc.vx	v31, s3, \in3
+        neg	s3, s3
+
+        vsetivli        t0, 4, e32, m1, tu, ma
+        vadd.vv             \out0, v28, v30
+        vadd.vv             \out1, v29, v31
+        vsub.vv             \out2, v29, v31
+        vsub.vv             \out3, v28, v30
+.endm
+
+.macro tr_16x4 name, shift, offset, step // 16x4 slice; \shift > 0 scales and stores, \shift == 0 spills to the stack
+func func_tr_16x4_\name, zve64x
+        mv             a1,  a5
+        addi             a3,  a5, \step * 64
+        li             a2,  \step * 128
+
+        load16          v16, v17, v18, v19
+
+        lla              a1, trans
+
+        vsetivli        t0, 8, e16, m1, tu, ma
+        vle16.v v0, (a1) 
+
+        tr16_8x4        v16, v17, v18, v19, \offset
+
+        addi             a1,  a5, \step * 32
+        addi             a3,  a5, \step * 3 *32
+
+        li             a2,  \step * 128
+
+        load16          v20, v17, v18, v19
+
+        lla              a1, trans
+        addi             a1, a1, 16
+
+        vsetivli         t0, 8, e16, m1, tu, ma
+        vle16.v         v1, (a1) 
+
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vmv.x.s	s2, v1
+        vslidedown.vi	v12, v1, 1
+        vmv.x.s	s3, v12
+        vslidedown.vi	v12, v1, 2
+        vmv.x.s	s4, v12
+        vslidedown.vi	v12, v1, 3
+        vmv.x.s	s5, v12
+        vslidedown.vi	v12, v1, 4
+        vmv.x.s	s6, v12
+        vslidedown.vi	v12, v1, 5
+        vmv.x.s	s7, v12
+        vslidedown.vi	v12, v1, 6
+        vmv.x.s	s8, v12
+        vslidedown.vi	v12, v1, 7
+        vmv.x.s	s9, v12
+
+        vsetivli        t0, 4, e16, mf2, tu, ma
+        vwmul.vx            v21, v20, s2
+        vwmul.vx            v22, v20, s3
+        vwmul.vx            v23, v20, s4
+        vwmul.vx            v24, v20, s5
+        vwmul.vx            v25, v20, s6
+        vwmul.vx            v26, v20, s7
+        vwmul.vx            v27, v20, s8
+        vwmul.vx            v28, v20, s9
+
+        vsetivli	t0, 1, e16, m1, tu, ma
+        vmv.x.s	s2, v1
+        vslidedown.vi	v12, v1, 1
+        vmv.x.s	s3, v12
+        vslidedown.vi	v12, v1, 2
+        vmv.x.s	s4, v12
+        vslidedown.vi	v12, v1, 3
+        vmv.x.s	s5, v12
+        vslidedown.vi	v12, v1, 4
+        vmv.x.s	s6, v12
+        vslidedown.vi	v12, v1, 5
+        vmv.x.s	s7, v12
+        vslidedown.vi	v12, v1, 6
+        vmv.x.s	s8, v12
+        vslidedown.vi	v12, v1, 7
+        vmv.x.s	s9, v12
+
+        add_member      v20, s3, s6, s9, s7, s4, s2, s5, s8, +, +, +, -, -, -, -, -, 2
+        add_member      v17, s4, s9, s5, s3, s8, s6, s2, s7, +, +, -, -, -, +, +, +
+        add_member      v17, s5, s7, s3, s9, s2, s8, s4, s6, +, -, -, +, +, +, -, -, 2
+        add_member      v18, s6, s4, s8, s2, s9, s3, s7, s5, +, -, -, +, -, -, +, +
+        add_member      v18, s7, s2, s6, s8, s3, s5, s9, s4, +, -, +, +, -, +, +, -, 2
+        add_member      v19, s8, s5, s2, s4, s7, s9, s6, s3, +, -, +, -, +, +, -, +
+        add_member      v19, s9, s8, s7, s6, s5, s4, s3, s2, +, -, +, -, +, -, +, -, 2
+
+        li a7, \offset
+        add a4, sp, a7
+
+        vsetivli        t0, 4, e32, m1, tu, ma 
+        vle32.v        v16, (a4)
+        addi            a4, a4, 16
+        vle32.v        v17, (a4)
+        addi            a4, a4, 16
+        vle32.v        v18, (a4)
+        addi            a4, a4, 16
+        vle32.v        v19, (a4)
+        addi            a4, a4, 16
+
+        butterfly16       v16,    v21,    v17,    v22,    v18,    v23,     v19,   v24
+    .if \shift > 0
+        scale             v29, v30, v31, v24, v20,    v16,    v21,    v17,    v22,    v18,    v23,    v19,    \shift
+
+        transpose16_4x4_2 v29, v30, v31, v24, v2, v3, v4, v5, v6, v7
+
+        mv             a1,  a6
+        addi             a3,  a6, 24 +3*32
+        li             a2, 32
+        li             a4, -32
+
+        store16         v29, v30, v31, v24, a4
+    .else
+        store_to_stack  \offset, (\offset + 240), v20, v21, v22, v23, v19, v18, v17, v16
+    .endif
+
+        li a7, \offset+64
+        add a4, sp, a7
+
+        vsetivli        t0, 4, e32, m1, tu, ma 
+        vle32.v        v16, (a4)
+        addi            a4, a4, 16
+        vle32.v        v17, (a4)
+        addi            a4, a4, 16
+        vle32.v        v18, (a4)
+        addi            a4, a4, 16
+        vle32.v        v19, (a4)
+        addi            a4, a4, 16
+
+        butterfly16     v16, v25, v17, v26, v18, v27, v19, v28
+   .if \shift > 0
+        scale             v29, v30, v31, v20, v20, v16, v25, v17, v26, v18, v27, v19, \shift
+        transpose16_4x4_2 v29, v30, v31, v20, v2, v3, v4, v5, v6, v7
+
+        add             a1,  a6, 8
+        add             a3,  a6, (16 + 3 * 32)
+        li             a2, 32
+        li             a4, -32
+        store16         v29, v30, v31, v20, a4
+   .else
+        store_to_stack  (\offset + 64), (\offset + 176), v20, v25, v26, v27, v19, v18, v17, v16
+   .endif
+
+        ret
+endfunc
+.endm
+
+tr_16x4 noscale, 0, 2048, 4
+
+.macro tr_32x4 name, shift             // 32x4 slice: 16-point part, then four odd blocks via tr_block1..4
+func func_tr_32x4_\name, zve64x
+        mv             t3, ra        
+        jal              func_tr_16x4_noscale
+
+        load32
+
+        lla              t2, trans
+        addi            t2, t2, 32
+
+        vsetivli        t0, 4, e16, m1, tu, ma 
+        vle16.v v0, (t2)
+        addi t2, t2, 2*4
+        vle16.v v1, (t2)
+        addi t2, t2, 2*4
+        vle16.v v2, (t2)
+        addi t2, t2, 2*4
+        vle16.v v3, (t2)
+        addi t2, t2, -2*4
+
+        li a7, 2048
+        add a4, sp, a7
+
+        li               a2, 64
+        li               t1, -64
+        
+        jal               tr_block1
+        mv                  a1, t4
+        addi             a3, t4, (56 + 3 * 64)
+        scale_store     \shift
+
+        jal              tr_block2
+        addi             a1, t4, 8
+        addi             a3, t4, (48 + 3 * 64)
+        scale_store     \shift
+
+        jal              tr_block3
+        addi             a1, t4, 16
+        addi             a3, t4, (40 + 3 * 64)
+        scale_store     \shift
+
+        jal              tr_block4
+        addi             a1, t4, 24
+        addi             a3, t4, (32 + 3 * 64)
+        scale_store     \shift
+
+        jr t3
+endfunc
+.endm
+
+tr_32x4 firstpass, 7
+tr_32x4 secondpass_8, 20 - 8
+
+.macro idct_32x32 bitdepth             // two passes of eight 32x4 slices: first to the stack, then back to the coefficient buffer
+func ff_hevc_idct_32x32_\bitdepth\()_rvv, zve64x
+        mv t6, ra
+        addi	sp,sp,-8*13
+        sd	ra,8*12(sp)
+        sd	s2,8*9(sp)
+        sd	s3,8*8(sp)
+        sd	s4,8*7(sp)
+        sd	s5,8*6(sp)
+        sd	s6,8*5(sp)
+        sd	s7,8*4(sp)
+        sd	s8,8*3(sp)
+        sd	s9,8*2(sp)
+        sd	s10,8*1(sp)
+        sd	s11,8*0(sp)
+
+        csrwi   vxrm, 1
+        li  a7, 2432 
+        sub sp, sp, a7
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+        li               a7, 8 * \i
+        add             a5, a0, a7
+
+        li               a7, 8 * \i * 32
+        add              t4, sp, a7
+        jal              func_tr_32x4_firstpass
+.endr
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+        addi            a5, sp, 8 * \i
+        addi             t4, a0, 8 * \i * 32
+        jal              func_tr_32x4_secondpass_\bitdepth
+.endr
+
+        li              a7, 2432 
+        add             sp,  sp,  a7
+        ld	ra,8*12(sp)
+        ld	s2,8*9(sp)
+        ld	s3,8*8(sp)
+        ld	s4,8*7(sp)
+        ld	s5,8*6(sp)
+        ld	s6,8*5(sp)
+        ld	s7,8*4(sp)
+        ld	s8,8*3(sp)
+        ld	s9,8*2(sp)
+        ld	s10,8*1(sp)
+        ld	s11,8*0(sp)
+        addi	sp,sp,8*13
+
+        jr t6
+endfunc
+.endm
+
+idct_32x32 8
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 1d8326a573..6dfb889eec 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -27,6 +27,8 @@
 #include "libavcodec/hevc/dsp.h"
 #include "libavcodec/riscv/h26x/h2656dsp.h"
 
+void ff_hevc_idct_32x32_8_rvv(int16_t *coeffs, int col_limit);
+
 #define RVV_FNASSIGN(member, v, h, fn, ext) \
         member[1][v][h] = ff_h2656_put_pixels_##8_##ext;  \
         member[3][v][h] = ff_h2656_put_pixels_##8_##ext;  \
@@ -40,27 +42,37 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
     const int flags = av_get_cpu_flags();
     int vlenb;
 
-    if (!(flags & AV_CPU_FLAG_RVV_I32) || !(flags & AV_CPU_FLAG_RVB))
-        return;
-
     vlenb = ff_get_rv_vlenb();
-    if (vlenb >= 32) {
-        switch (bit_depth) {
-            case 8:
-                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
-                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
-                break;
-            default:
-                break;
-        }
-    } else if (vlenb >= 16) {
-        switch (bit_depth) {
-            case 8:
-                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
-                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
-                break;
-            default:
-                break;
+
+    if (flags & AV_CPU_FLAG_RVV_I64)
+        if (vlenb >= 16)
+            switch (bit_depth) {
+                case 8:
+                    c->idct[3]                     = ff_hevc_idct_32x32_8_rvv;
+                    break;
+                default:
+                    break;
+            }
+
+    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)){
+        if (vlenb >= 32) {
+            switch (bit_depth) {
+                case 8:
+                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
+                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
+                    break;
+                default:
+                    break;
+            }
+        } else if (vlenb >= 16) {
+            switch (bit_depth) {
+                case 8:
+                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
+                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
+                    break;
+                default:
+                    break;
+            }
         }
     }
 #endif
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".


* Re: [FFmpeg-devel] [PATCH] libavcodec/riscv:add RVV optimized for idct_32x32_8:
  2025-04-15  7:34 [FFmpeg-devel] [PATCH] libavcodec/riscv:add RVV optimized for idct_32x32_8: daichengrong
@ 2025-04-15 15:02 ` Rémi Denis-Courmont
  2025-04-17 21:07 ` Michael Niedermayer
  2025-04-28  9:32 ` [FFmpeg-devel] [PATCH v v2] " daichengrong
  2 siblings, 0 replies; 4+ messages in thread
From: Rémi Denis-Courmont @ 2025-04-15 15:02 UTC (permalink / raw)
  To: ffmpeg-devel

Hi,

On Tuesday, 15 April 2025 at 10:34:24 Eastern European Summer Time,
daichengrong@iscas.ac.cn wrote:
> From: daichengrong <daichengrong@iscas.ac.cn>
> 
>      riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8
> 
>      On Banana PI F3:
> 
>      hevc_idct_32x32_8_c:                                119579.3 ( 1.00x)
>      hevc_idct_32x32_8_rvv_i64:                           51254.4 ( 2.33x)
> 
> Signed-off-by: daichengrong <daichengrong@iscas.ac.cn>
> ---
>  libavcodec/riscv/Makefile           |    1 +
>  libavcodec/riscv/hevcdsp_idct_rvv.S | 1042 +++++++++++++++++++++++++++
>  libavcodec/riscv/hevcdsp_init.c     |   52 +-
>  3 files changed, 1075 insertions(+), 20 deletions(-)
>  create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S
> 
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index a80d2fa2e7..dfc33afbee 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o
> riscv/h264dsp_rvv.o \ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
>  RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
>  OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
> +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
>  RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
>  OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
>  RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
> diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S
> b/libavcodec/riscv/hevcdsp_idct_rvv.S new file mode 100644
> index 0000000000..f8dd2e5bf4
> --- /dev/null
> +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
> @@ -0,0 +1,1042 @@
> +/*
> + * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences
> (ISCAS). + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +const trans, align=4
> +        .2byte          64, 83, 64, 36
> +        .2byte          89, 75, 50, 18
> +        .2byte          90, 87, 80, 70
> +        .2byte          57, 43, 25, 9
> +        .2byte          90, 90, 88, 85
> +        .2byte          82, 78, 73, 67
> +        .2byte          61, 54, 46, 38
> +        .2byte          31, 22, 13, 4
> +endconst
> +
> +.macro sum_sub out, in, c, op, p
> +        vsetivli	t0, 4, e16, mf2, tu, ma

I think that you don't need t0 here? Ditto below.
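
As a minimal illustration of that point: the new vl is never read back,
so the destination of vsetivli can simply be the zero register.

        vsetivli        zero, 4, e16, mf2, tu, ma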

> +  .ifc \op, +
> +        .ifc \p, 2
> +                vslidedown.vi	v8, \in, 4
> +                vwmacc.vx	\out, \c, v8
> +        .else
> +                vwmacc.vx	\out, \c, \in
> +        .endif
> +  .else
> +        .ifc \p, 2
> +                neg	\c, \c
> +                vslidedown.vi	v8, \in, 4
> +                vwmacc.vx	\out, \c, v8
> +                neg	\c, \c
> +        .else
> +                neg	\c, \c
> +                vwmacc.vx	\out, \c, \in
> +                neg	\c, \c

The typical problem with complex nested macros like this is that you easily end up 
assembling very inefficient code.

For instance, this keeps vainly flipping the sign of the same value over and 
over only to allow this macro to exist.
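
A minimal sketch of one way around that, assuming a scratch register
(t1 here) is free: negate the coefficient once into the scratch register
and feed that to the MAC, instead of negating \c in place before and after.

        neg             t1, \c              // one-off negated copy
        vwmacc.vx       \out, t1, \in       // \out -= \c * \in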

> +        .endif
> +  .endif
> +.endm
> +
> +.macro add_member32 in, t0, index0, t1, index1, t2, index2, t3, index3,
> op0, op1, op2, op3, p
> +        vsetivli	t0, 1, e16, m1, tu, ma
> +        vslidedown.vi	v12, \t0, \index0
> +        vmv.x.s	s2, v12
> +        vslidedown.vi	v12, \t1, \index1
> +        vmv.x.s	s3, v12
> +        vslidedown.vi	v12, \t2, \index2
> +        vmv.x.s	s4, v12
> +        vslidedown.vi	v12, \t3, \index3
> +        vmv.x.s	s5, v12

This is a very inefficient way to extract 4 scalars out of a vector. I'm not 
familiar with the overall specific algorithm, but I would expect that this can 
be avoided. At least, I have never seen the need for such a construct to implement a 
DCT. And we already have quite a few DCTs in the FFmpeg RISC-V port.
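
As an illustrative sketch, assuming a1 still points at the trans table at
this point: the four coefficients can be fetched as scalars straight from
memory, with no vector slides or vmv.x.s round trips at all.

        lh              s2, 0(a1)           // trans[0]
        lh              s3, 2(a1)           // trans[1]
        lh              s4, 4(a1)           // trans[2]
        lh              s5, 6(a1)           // trans[3]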

Admittedly for smaller matrices than 32x32. But typically larger matrices are 
not as big a deal with RVV as they are on Arm or x86. RVV requires spilling to 
the intermediate values from the first DCT dimension to memory, to transpose 
them before the second DCT dimension. That being the case, the penalty for not 
fitting the entire matrix in the vector register bank is comparatively much 
smaller.

And yes, please don't use slides for transposition. It's horribly complicated 
and almost certainly slower than spilling to stack and using strided loads/
stores, for any non-trivial matrix size.
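
A rough sketch of the strided approach, with hypothetical registers: once
a 4x8 block of 16-bit values has been spilled to memory at (a4) in
row-major order, each column of the transpose is just one strided load.

        li              t1, 8 * 2                   // row stride in bytes
        vsetivli        zero, 4, e16, mf2, ta, ma
        vlse16.v        v20, (a4), t1               // column 0
        addi            a4, a4, 2
        vlse16.v        v21, (a4), t1               // column 1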

-- 
Rémi Denis-Courmont
Tapiolan uusi kaupunki, Uudenmaan entinen Suomen tasavalta




* Re: [FFmpeg-devel] [PATCH] libavcodec/riscv:add RVV optimized for idct_32x32_8:
  2025-04-15  7:34 [FFmpeg-devel] [PATCH] libavcodec/riscv:add RVV optimized for idct_32x32_8: daichengrong
  2025-04-15 15:02 ` Rémi Denis-Courmont
@ 2025-04-17 21:07 ` Michael Niedermayer
  2025-04-28  9:32 ` [FFmpeg-devel] [PATCH v v2] " daichengrong
  2 siblings, 0 replies; 4+ messages in thread
From: Michael Niedermayer @ 2025-04-17 21:07 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



On Tue, Apr 15, 2025 at 03:34:24PM +0800, daichengrong@iscas.ac.cn wrote:
> From: daichengrong <daichengrong@iscas.ac.cn>
> 
>      riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8
> 
>      On Banana PI F3:
> 
>      hevc_idct_32x32_8_c:                                119579.3 ( 1.00x)
>      hevc_idct_32x32_8_rvv_i64:                           51254.4 ( 2.33x)
> 
> Signed-off-by: daichengrong <daichengrong@iscas.ac.cn>
> ---
>  libavcodec/riscv/Makefile           |    1 +
>  libavcodec/riscv/hevcdsp_idct_rvv.S | 1042 +++++++++++++++++++++++++++
>  libavcodec/riscv/hevcdsp_init.c     |   52 +-
>  3 files changed, 1075 insertions(+), 20 deletions(-)
>  create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

There are 358 tabs in this; please use spaces,
both for consistency with the rest of the .S files and also
because the git push hook will block this

thx

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Republics decline into democracies and democracies degenerate into
despotisms. -- Aristotle


* [FFmpeg-devel] [PATCH v v2] libavcodec/riscv:add RVV optimized for idct_32x32_8:
  2025-04-15  7:34 [FFmpeg-devel] [PATCH] libavcodec/riscv:add RVV optimized for idct_32x32_8: daichengrong
  2025-04-15 15:02 ` Rémi Denis-Courmont
  2025-04-17 21:07 ` Michael Niedermayer
@ 2025-04-28  9:32 ` daichengrong
  2 siblings, 0 replies; 4+ messages in thread
From: daichengrong @ 2025-04-28  9:32 UTC (permalink / raw)
  To: ffmpeg-devel

From: daichengrong <daichengrong@iscas.ac.cn>

     riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8

     On Banana PI F3:

     hevc_idct_32x32_8_c:                                118945.0 ( 1.00x)
     hevc_idct_32x32_8_rvv_i64:                           28503.7 ( 4.17x)

Signed-off-by: daichengrong <daichengrong@iscas.ac.cn>
---
 libavcodec/riscv/Makefile           |   1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 973 ++++++++++++++++++++++++++++
 libavcodec/riscv/hevcdsp_init.c     |  52 +-
 3 files changed, 1006 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 0000000000..561b8ada47
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+const trans, align=4
+        .2byte          64, 83, 64, 36
+        .2byte          89, 75, 50, 18
+        .2byte          90, 87, 80, 70
+        .2byte          57, 43, 25, 9
+        .2byte          90, 90, 88, 85
+        .2byte          82, 78, 73, 67
+        .2byte          61, 54, 46, 38
+        .2byte          31, 22, 13, 4
+endconst
+
+.macro sum_sub out, in, c, op, p
+        mv t0, \c
+        .ifc \op, -
+                        neg    t0, t0
+        .endif
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        .ifc \p, 2  
+                vslidedown.vi    v8, \in, 4
+                vwmacc.vx    \out, t0, v8
+        .else
+                vwmacc.vx    \out, t0, \in
+        .endif
+.endm
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
+        sum_sub         v24, \in, \t0, \op0, \p
+        sum_sub         v25, \in, \t1, \op1, \p
+        sum_sub         v26, \in, \t2, \op2, \p
+        sum_sub         v27, \in, \t3, \op3, \p
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m
+        vsetivli    zero, 4, e32, m1, tu, ma
+        vadd.vv             \tmp_p, \e, \o
+        vsub.vv             \tmp_m, \e, \o
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+        vsetivli    zero, 4, e32, m1, tu, ma
+        vadd.vv              v20, \in0, \in1
+        vsub.vv              \in0, \in0, \in1
+        vadd.vv              \in1, \in2, \in3
+        vsub.vv              \in2, \in2, \in3
+        vadd.vv              \in3, \in4, \in5
+        vsub.vv              \in4, \in4, \in5
+        vadd.vv              \in5, \in6, \in7
+        vsub.vv              \in6, \in6, \in7
+.endm
+
+.macro multiply in
+        vsetivli    zero, 4, e16, m1, tu, ma
+        vse16.v         \in, (s0)
+        ld      s2, 0*2(s0)
+        ld      s3, 1*2(s0)
+        ld      s4, 2*2(s0)
+        ld      s5, 3*2(s0)
+
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vwmul.vx        v24, v4, s2
+        vwmul.vx        v25, v4, s3
+        vwmul.vx        v26, v4, s4
+        vwmul.vx        v27, v4, s5
+.endm
+
+func tr_block1, zve64x
+        multiply        v0
+
+        addi    sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+        sd    x\i,8*(\i - 10)(sp)
+.endr
+        vsetivli    zero, 4, e16, m1, tu, ma
+        vse16.v         v0, (s0)
+        ld      x10, 0*2(s0)
+        ld      x11, 1*2(s0)
+        ld      x12, 2*2(s0)
+        ld      x13, 3*2(s0)
+        vse16.v         v1, (s0)
+        ld      x14, 0*2(s0)
+        ld      x15, 1*2(s0)
+        ld      x16, 2*2(s0)
+        ld      x17, 3*2(s0)
+        vse16.v         v2, (s0)
+        ld      x18, 0*2(s0)
+        ld      x19, 1*2(s0)
+        ld      x20, 2*2(s0)
+        ld      x21, 3*2(s0)
+        vse16.v         v3, (s0)
+        ld      x22, 0*2(s0)
+        ld      x23, 1*2(s0)
+        ld      x24, 2*2(s0)
+        ld      x25, 3*2(s0)
+        
+        add_member32    v4,  x11, x14, x17, x20, +, +, +, +, 2
+        add_member32    v5,  x12, x17, x22, x24, +, +, +, -
+        add_member32    v5,  x13, x20, x24, x17, +, +, -, -, 2
+        add_member32    v6,  x14, x23, x19, x10, +, +, -, -
+        add_member32    v6,  x15, x25, x14, x16, +, -, -, -, 2
+        add_member32    v7,  x16, x22, x10, x23, +, -, -, -
+        add_member32    v7,  x17, x19, x15, x21, +, -, -, +, 2
+        add_member32    v16, x18, x16, x20, x14, +, -, -, +
+        add_member32    v16, x19, x13, x25, x12, +, -, -, +, 2
+        add_member32    v17, x20, x11, x21, x19, +, -, +, +
+        add_member32    v17, x21, x12, x16, x25, +, -, +, -, 2
+        add_member32    v18, x22, x15, x11, x18, +, -, +, -
+        add_member32    v18, x23, x18, x13, x11, +, -, +, -, 2
+        add_member32    v19, x24, x21, x18, x15, +, -, +, -
+        add_member32    v19, x25, x24, x23, x22, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+        ld    x\i, 8*(\i - 10)(sp)
+.endr
+        addi    sp, sp, 8*16
+
+        ret
+endfunc
+
+func tr_block2, zve64x
+        multiply        v1
+
+        addi    sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+        sd    x\i,8*(\i - 10)(sp)
+.endr
+        vsetivli    zero, 4, e16, m1, tu, ma
+        vse16.v         v0, (s0)
+        ld      x10, 0*2(s0)
+        ld      x11, 1*2(s0)
+        ld      x12, 2*2(s0)
+        ld      x13, 3*2(s0)
+        vse16.v         v1, (s0)
+        ld      x14, 0*2(s0)
+        ld      x15, 1*2(s0)
+        ld      x16, 2*2(s0)
+        ld      x17, 3*2(s0)
+        vse16.v         v2, (s0)
+        ld      x18, 0*2(s0)
+        ld      x19, 1*2(s0)
+        ld      x20, 2*2(s0)
+        ld      x21, 3*2(s0)
+        vse16.v         v3, (s0)
+        ld      x22, 0*2(s0)
+        ld      x23, 1*2(s0)
+        ld      x24, 2*2(s0)
+        ld      x25, 3*2(s0)
+
+        add_member32    v4,  x23, x25, x22, x19, +, -, -, -, 2
+        add_member32    v5,  x19, x14, x10, x15, -, -, -, -
+        add_member32    v5,  x10, x16, x23, x21, -, -, -, +, 2
+        add_member32    v6,  x18, x24, x15, x13, -, +, +, +
+        add_member32    v6,  x24, x13, x17, x23, +, +, +, -, 2
+        add_member32    v7,  x15, x17, x21, x10, +, +, -, -
+        add_member32    v7,  x13, x23, x11, x25, +, -, -, +, 2
+        add_member32    v16, x22, x12, x24, x11, +, -, -, +
+        add_member32    v16, x20, x18, x14, x24, -, -, +, +, 2
+        add_member32    v17, x11, x22, x18, x12, -, +, +, -
+        add_member32    v17, x17, x11, x20, x22, -, +, -, -, 2
+        add_member32    v18, x25, x19, x12, x14, +, +, -, +
+        add_member32    v18, x16, x21, x25, x20, +, -, -, +, 2
+        add_member32    v19, x12, x11, x13, x16, +, -, +, -
+        add_member32    v19, x21, x20, x19, x18, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+        ld    x\i, 8*(\i - 10)(sp)
+.endr
+        addi    sp, sp, 8*16
+
+        ret
+endfunc
+
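+/* tr_block3: same pattern as tr_block1, with the factor/sign combinations
+ * for the third group of four outputs. */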
+func tr_block3, zve64x
+        multiply        v2
+        addi    sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+        sd    x\i,8*(\i - 10)(sp)
+.endr
+        vsetivli    zero, 4, e16, m1, tu, ma
+        vse16.v         v0, (s0)
+        ld      x10, 0*2(s0)
+        ld      x11, 1*2(s0)
+        ld      x12, 2*2(s0)
+        ld      x13, 3*2(s0)
+        vse16.v         v1, (s0)
+        ld      x14, 0*2(s0)
+        ld      x15, 1*2(s0)
+        ld      x16, 2*2(s0)
+        ld      x17, 3*2(s0)
+        vse16.v         v2, (s0)
+        ld      x18, 0*2(s0)
+        ld      x19, 1*2(s0)
+        ld      x20, 2*2(s0)
+        ld      x21, 3*2(s0)
+        vse16.v         v3, (s0)
+        ld      x22, 0*2(s0)
+        ld      x23, 1*2(s0)
+        ld      x24, 2*2(s0)
+        ld      x25, 3*2(s0)
+
+        add_member32    v4,  x16, x13, x10, x12, -, -, -, -, 2
+        add_member32    v5,  x20, x25, x21, x16, -, -, +, +
+        add_member32    v5,  x14, x12, x19, x25, +, +, +, -, 2
+        add_member32    v6,  x22, x20, x11, x17, +, -, -, -
+        add_member32    v6,  x12, x18, x22, x10, -, -, +, +, 2
+        add_member32    v7,  x24, x14, x18, x20, -, +, +, -
+        add_member32    v7,  x10, x24, x12, x22, +, +, -, -, 2
+        add_member32    v16, x25, x11, x23, x13, -, -, +, +
+        add_member32    v16, x11, x21, x17, x15, -, +, +, -, 2
+        add_member32    v17, x23, x17, x13, x24, +, +, -, +
+        add_member32    v17, x13, x15, x24, x18, +, -, +, +, 2
+        add_member32    v18, x21, x23, x16, x11, -, -, +, -
+        add_member32    v18, x15, x10, x14, x19, -, +, -, +, 2
+        add_member32    v19, x19, x22, x25, x23, +, -, +, +
+        add_member32    v19, x17, x16, x15, x14, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+        ld    x\i, 8*(\i - 10)(sp)
+.endr
+        addi    sp, sp, 8*16
+
+        ret
+endfunc
+
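+/* tr_block4: same pattern as tr_block1, with the factor/sign combinations
+ * for the last group of four outputs. */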
+func tr_block4, zve64x
+        multiply        v3
+        addi    sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+        sd    x\i,8*(\i - 10)(sp)
+.endr
+        vsetivli    zero, 4, e16, m1, tu, ma
+        vse16.v         v0, (s0)
+        ld      x10, 0*2(s0)
+        ld      x11, 1*2(s0)
+        ld      x12, 2*2(s0)
+        ld      x13, 3*2(s0)
+        vse16.v         v1, (s0)
+        ld      x14, 0*2(s0)
+        ld      x15, 1*2(s0)
+        ld      x16, 2*2(s0)
+        ld      x17, 3*2(s0)
+        vse16.v         v2, (s0)
+        ld      x18, 0*2(s0)
+        ld      x19, 1*2(s0)
+        ld      x20, 2*2(s0)
+        ld      x21, 3*2(s0)
+        vse16.v         v3, (s0)
+        ld      x22, 0*2(s0)
+        ld      x23, 1*2(s0)
+        ld      x24, 2*2(s0)
+        ld      x25, 3*2(s0)
+
+        add_member32    v4,  x15, x18, x21, x24, -, -, -, -, 2
+        add_member32    v5,  x10, x13, x18, x23, +, +, +, +
+        add_member32    v5,  x18, x10, x15, x22, -, -, -, -, 2
+        add_member32    v6,  x25, x16, x12, x21, +, +, +, +
+        add_member32    v6,  x19, x21, x10, x20, +, -, -, -, 2
+        add_member32    v7,  x12, x25, x13, x19, -, -, +, +
+        add_member32    v7,  x14, x20, x16, x18, +, +, -, -, 2
+        add_member32    v16, x21, x15, x19, x17, -, -, +, +
+        add_member32    v16, x23, x11, x22, x16, -, +, -, -, 2
+        add_member32    v17, x16, x14, x25, x15, +, -, +, +
+        add_member32    v17, x11, x19, x23, x14, -, +, +, -, 2
+        add_member32    v18, x17, x24, x20, x13, +, -, -, +
+        add_member32    v18, x24, x22, x17, x12, -, -, +, -, 2
+        add_member32    v19, x20, x17, x14, x11, -, +, -, +
+        add_member32    v19, x13, x12, x11, x10, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+        ld    x\i, 8*(\i - 10)(sp)
+.endr
+        addi    sp, sp, 8*16
+
+        ret
+endfunc
+
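+/* butterfly32: two add/sub butterflies over 4 x 32-bit lanes; the sums end
+ * up in \out and \in1, the differences in \in0 and \in2. */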
+.macro butterfly32 in0, in1, in2, in3, out
+        vsetivli    zero, 4, e32, m1, tu, ma
+        vadd.vv             \out, \in0, \in1
+        vsub.vv             \in0, \in0, \in1
+        vadd.vv             \in1, \in2, \in3
+        vsub.vv             \in2, \in2, \in3
+.endm
+
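+/* load16: fill \in0-\in3 with two 64-bit lanes each: lane 0 holds four
+ * halfwords loaded from (a1), lane 1 four halfwords from (a3); both
+ * pointers advance by a2 after every row. */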
+.macro load16 in0, in1, in2, in3
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v         v8, (a3)
+        
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx    \in0, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v         \in0, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+     
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx \in1, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v \in1, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+        
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx \in2, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v \in2, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+        
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx \in3, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v \in3, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+.endm
+
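+/* store16: inverse of load16: lane 0 of each source goes to (a1), lane 1
+ * to (a3); a1 advances by a2 and a3 by \rx (a negative stride at the call
+ * sites, so a3 walks backwards over the mirrored rows). */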
+.macro store16 in0, in1, in2, in3, rx
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vse64.v \in0, (a1)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1down.vx v8, \in0, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vse64.v v8, (a3)
+        add a1, a1, a2
+        add a3, a3, \rx
+
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vse64.v \in1, (a1)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1down.vx v8, \in1, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vse64.v v8, (a3)
+        add a1, a1, a2
+        add a3, a3, \rx
+        
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vse64.v \in2, (a1)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1down.vx v8, \in2, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vse64.v v8, (a3)
+        add a1, a1, a2
+        add a3, a3, \rx
+        
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vse64.v \in3, (a1)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1down.vx v8, \in3, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vse64.v v8, (a3)
+        add a1, a1, a2
+        add a3, a3, \rx 
+.endm
+
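+/* load32: gather the odd rows of the current four-column strip into v4-v7
+ * and v16-v19 with the same two-lane packing as load16: a1 starts at
+ * a5 + 64, a3 another 128 bytes on, both stepping by 256 bytes. */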
+.macro load32
+        addi             a1,  a5, 64
+        addi             a3,  a1, 128
+        li               a2,  256
+        
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx v4, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v4, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx v5, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v5, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx v6, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v6, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx v7, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v7, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx v16, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v16, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx v17, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v17, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx v18, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v18, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v8, (a3)
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx v19, v8, zero
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vle64.v v19, (a1) 
+        add a1, a1, a2
+        add a3, a3, a2
+
+.endm
+
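+/* add_member: multiply-accumulate one source vector into the eight 32-bit
+ * accumulators v21-v28 with eight scalar factors and per-term signs;
+ * \p == 2 makes sum_sub use the upper four halfwords of \in. */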
+.macro add_member in, tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7, op0, op1, op2, op3, op4, op5, op6, op7, p
+        sum_sub         v21, \in, \tt0, \op0, \p
+        sum_sub         v22, \in, \tt1, \op1, \p
+        sum_sub         v23, \in, \tt2, \op2, \p
+        sum_sub         v24, \in, \tt3, \op3, \p
+        sum_sub         v25, \in, \tt4, \op4, \p
+        sum_sub         v26, \in, \tt5, \op5, \p
+        sum_sub         v27, \in, \tt6, \op6, \p
+        sum_sub         v28, \in, \tt7, \op7, \p
+.endm
+
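+/* scale_store: reload four vectors of the stacked 16-point results from
+ * (a4), butterfly them against the accumulators v24-v27, narrow with a
+ * rounding shift of \shift, transpose the tile and store it via store16;
+ * finally restore the factor vectors v2/v3 (clobbered by the butterflies)
+ * from the table pointer in t2. */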
+.macro scale_store shift
+        vsetivli    zero, 8, e16, m1, tu, ma 
+        vle16.v        v28, (a4)
+        addi            a4, a4, 2*8
+        vle16.v        v29, (a4)
+        addi            a4, a4, 2*8
+        vle16.v        v30, (a4)
+        addi            a4, a4, 2*8
+        vle16.v        v31, (a4)
+        addi            a4, a4, 2*8
+
+        butterfly32     v28, v24, v29, v25, v2
+        butterfly32     v30, v26, v31, v27, v3
+
+        scale           v20, v21, v22, v23, v2, v28, v24, v29, v3, v30, v26, v31, \shift
+
+        transpose16_4x4_2 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+
+        store16         v20, v21, v22, v23, t1
+
+        vsetivli    zero, 4, e16, m1, tu, ma 
+        vle16.v         v2, (t2)
+        addi            t2, t2, 8
+        vle16.v         v3, (t2)
+        addi            t2, t2, -8
+.endm
+
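+/* store_to_stack: spill eight 32-bit accumulators of the no-scale pass:
+ * \in0/\in2/\in4/\in6 ascend from sp + \off1 while \in1/\in3/\in5/\in7
+ * descend from sp + \off2. */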
+.macro store_to_stack off1, off2, in0, in2, in4, in6, in7, in5, in3, in1
+        li      a7, \off1
+        add     a1, sp, a7
+        li      a7, \off2
+        add     a3, sp, a7  
+        li      a2, -16
+        li      a4, 16
+
+        vsetivli    zero, 4, e32, m1, tu, ma
+        vse32.v         \in0, (a1)
+        add             a1, a1, a4
+        vse32.v         \in1, (a3)
+        add             a3, a3, a2
+        vse32.v         \in2, (a1)
+        add             a1, a1, a4
+        vse32.v         \in3, (a3)
+        add             a3, a3, a2
+        vse32.v         \in4, (a1)
+        add             a1, a1, a4
+        vse32.v         \in5, (a3)
+        add             a3, a3, a2
+        vse32.v         \in6, (a1)
+        vse32.v         \in7, (a3)
+.endm
+
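+/* transpose16_4x4_2: transpose the two 4x4 blocks of 16-bit elements held
+ * in the low and high halves of r0-r3, using slide/merge pairs under an
+ * odd-lane mask built with vid/vand/vmsne.  v0 doubles as the mask
+ * register, so it is parked at (s0) on entry and reloaded at the end. */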
+.macro transpose16_4x4_2 r0, r1, r2, r3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
+        vsetivli    zero, 8, e16, m1, tu, ma
+        vse16.v         v0, (s0)
+
+        vsetivli    zero, 8, e16, m1, tu, ma
+        vid.v     v0
+        vand.vi   v8, v0, 1
+        vmsne.vi  v0, v8, 0
+
+        vslideup.vi    v8, v\r1, 1
+        vsetivli    zero, 4, e16, m1, ta, ma
+        vmerge.vvm       v\tmp0\(), v\r0\(), v8, v0
+
+        vslidedown.vi    v8, v\r0, 1
+        vmerge.vvm       v\tmp1\(), v8, v\r1\(), v0
+
+        vslideup.vi    v8, v\r3, 1
+        vmerge.vvm       v\tmp2\(), v\r2\(), v8, v0
+        
+        vslidedown.vi    v8, v\r2\(), 1
+        vmerge.vvm       v\tmp3\(), v8, v\r3\(), v0
+
+        vsetivli    zero, 2, e32, m1, tu, ma
+
+        vslideup.vi    v8, v\tmp2\(), 1
+        vmerge.vvm       v\tmp4\(), v\tmp0\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp0\(), 1
+        vmerge.vvm       v\tmp5\(), v8, v\tmp2\(), v0
+
+        vslideup.vi    v8, v\tmp3\(), 1
+        vmerge.vvm       v\tmp0\(), v\tmp1\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp1\(), 1
+        vmerge.vvm       v\tmp2\(), v8, v\tmp3\(), v0
+
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vmv.v.v  v\r0\(), v\tmp4\()
+        vmv.v.v  v\r2\(), v\tmp5\()
+        vmv.v.v  v\r1\(), v\tmp0\()
+        vmv.v.v  v\r3\(), v\tmp2\()
+
+        vsetivli    zero, 8, e16, m1, tu, ma
+
+        vslideup.vi    v8, v\r2\(), 1
+        vmerge.vvm       v\tmp0\(), v\r3\(), v8, v0
+
+        vslidedown.vi    v8, v\r3\(), 1
+        vmerge.vvm        v\tmp1\(), v8, v\r2\(), v0
+
+        vslideup.vi    v8, v\r0\(), 1
+        vmerge.vvm       v\tmp2\(), v\r1\(), v8, v0
+
+        vslidedown.vi    v8, v\r1\(), 1
+        vmerge.vvm        v\tmp3\(), v8, v\r0\(), v0
+
+        vsetivli    zero, 4, e32, m1, tu, ma
+
+        vslideup.vi    v8, v\tmp2\(), 1
+        vmerge.vvm       v\tmp4\(), v\tmp0\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp0\(), 1
+        vmerge.vvm       v\tmp5\(), v8, v\tmp2\(), v0
+
+        vslideup.vi    v8, v\tmp3\(), 1
+        vmerge.vvm       v\tmp0\(), v\tmp1\(), v8, v0
+
+        vslidedown.vi    v8, v\tmp1\(), 1
+        vmerge.vvm       v\tmp2\(), v8, v\tmp3\(), v0
+
+        vsetivli    zero, 2, e64, m1, tu, ma
+
+        vmerge.vvm       v\r3\(), v\r3\(), v\tmp4\(), v0
+        vmerge.vvm       v\r1\(), v\r1\(), v\tmp5\(), v0
+        vmerge.vvm       v\r2\(), v\r2\(), v\tmp0\(), v0
+        vmerge.vvm       v\r0\(), v\r0\(), v\tmp2\(), v0
+
+        vsetivli    zero, 8, e16, m1, tu, ma
+        vle16.v         v0, (s0)
+.endm
+
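+/* tr16_8x4: 4-point stage via tr_4x4_8 into v24-v27, plus an accumulation
+ * against the factors in elements 4-7 of v0 into v28-v31; the butterflied
+ * results v16-v23 are spilled to sp + \offset. */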
+.macro tr16_8x4 in0, in1, in2, in3, offset
+        tr_4x4_8        \in0, \in1, \in2, \in3, v24, v25, v26, v27
+        
+        vsetivli    zero, 8, e16, m1, tu, ma
+        vse16.v     v0, (s0)
+        lh      s6, 4*2(s0)
+        lh      s7, 5*2(s0)
+        lh      s8, 6*2(s0)
+        lh      s9, 7*2(s0)
+        
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vslidedown.vi    v8, \in0, 4
+        vwmul.vx    v28, v8, s6
+        vslidedown.vi    v8, \in0, 4
+        vwmul.vx    v29, v8, s7
+        vslidedown.vi    v8, \in0, 4
+        vwmul.vx    v30, v8, s8
+        vslidedown.vi    v8, \in0, 4
+        vwmul.vx    v31, v8, s9
+
+        sum_sub         v28, \in1, s7, +, 2
+        sum_sub         v29, \in1, s9, -, 2
+        sum_sub         v30, \in1, s6, -, 2
+        sum_sub         v31, \in1, s8, -, 2
+        sum_sub         v28, \in2, s8, +, 2
+        sum_sub         v29, \in2, s6, -, 2
+        sum_sub         v30, \in2, s9, +, 2
+        sum_sub         v31, \in2, s7, +, 2
+        sum_sub         v28, \in3, s9, +, 2
+        sum_sub         v29, \in3, s8, -, 2
+        sum_sub         v30, \in3, s7, +, 2
+        sum_sub         v31, \in3, s6, -, 2
+
+        butterfly       v24, v28, v16, v23
+        butterfly       v25, v29, v17, v22
+        butterfly       v26, v30, v18, v21
+        butterfly       v27, v31, v19, v20
+        
+        li a7, \offset
+        add a4, sp, a7
+
+        vsetivli    zero, 4, e32, m1, tu, ma
+        vse32.v v16, (a4)
+        addi a4, a4, 16
+        vse32.v v17, (a4)
+        addi a4, a4, 16
+        vse32.v v18, (a4)
+        addi a4, a4, 16
+        vse32.v v19, (a4)
+        addi a4, a4, 16
+
+        vse32.v v20, (a4)
+        addi a4, a4, 16
+        vse32.v v21, (a4)
+        addi a4, a4, 16
+        vse32.v v22, (a4)
+        addi a4, a4, 16
+        vse32.v v23, (a4)
+        addi a4, a4, 16
+
+        addi a4, a4, -64
+.endm
+
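+/* scale: narrow eight 32-bit accumulators to 16 bits with vnclip (rounding
+ * mode taken from vxrm, set by the top-level function) and pack them in
+ * pairs into the four 8-halfword outputs. */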
+.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vnclip.wi    \out0\(), \in0\(), \shift
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vmv.x.s    a7, \out0\()
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vnclip.wi    v8, \in1\(), \shift
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx    \out0\(), v8, a7
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vnclip.wi    \out1\(), \in2\(), \shift
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vmv.x.s    a7, \out1\()
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vnclip.wi    v8, \in3\(), \shift
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx    \out1\(), v8, a7
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vnclip.wi    \out2\(), \in4\(), \shift
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vmv.x.s    a7, \out2\()
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vnclip.wi    v8, \in5\(), \shift
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx    \out2\(), v8, a7
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vnclip.wi    \out3\(), \in6\(), \shift
+        vsetivli    zero, 1, e64, m1, tu, ma
+        vmv.x.s    a7, \out3\()
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vnclip.wi    v8, \in7\(), \shift
+        vsetivli    zero, 2, e64, m1, tu, ma
+        vslide1up.vx    \out3\(), v8, a7
+.endm
+
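+/* tr_4x4_8: 4-point transform using the factors 64, 83 and 36 picked from
+ * elements 0, 1 and 3 of v0 (loaded from the trans table by the caller):
+ * \in0/\in2 form the even pair (the << 6 is the *64), \in1/\in3 the odd
+ * pair. */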
+.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
+        vsetivli    zero, 4, e16, m1, tu, ma
+        vwcvt.x.x.v     v8, \in0  
+        vsetivli    zero, 4, e32, m1, tu, ma
+        vsll.vi         v28, v8, 6
+
+        vsetivli    zero, 16, e8, m1, tu, ma
+        vmv.v.v v29, v28
+
+        vsetivli    zero, 1, e16, m1, tu, ma
+        vmv.x.s    s2, v0
+        vslidedown.vi    v12, v0, 1
+        vmv.x.s    s3, v12
+        vslidedown.vi    v12, v0, 3
+        vmv.x.s    s5, v12
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vwmul.vx    v30, \in1, s3
+        vwmul.vx    v31, \in1, s5
+        vwmacc.vx    v28, s2, \in2
+        neg    s2, s2
+        vwmacc.vx    v29, s2, \in2
+        neg    s2, s2
+        vwmacc.vx    v30, s5, \in3
+        neg    s3, s3
+        vwmacc.vx    v31, s3, \in3
+        neg    s3, s3
+
+        vsetivli    zero, 4, e32, m1, tu, ma
+        vadd.vv             \out0, v28, v30
+        vadd.vv             \out1, v29, v31
+        vsub.vv             \out2, v29, v31
+        vsub.vv             \out3, v28, v30
+.endm
+
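+/* func_tr_16x4_*: 16-point stage for a strip of four columns.  The first
+ * load16 batch goes through tr16_8x4 onto the stack at \offset, the second
+ * batch is accumulated in v21-v28 against the factors 90,87,...,9 from
+ * trans + 16, and the two halves are butterflied, then either scaled and
+ * stored (\shift > 0) or written back to the stack for the 32-point
+ * stage. */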
+.macro tr_16x4 name, shift, offset, step
+func func_tr_16x4_\name, zve64x
+        mv             a1,  a5
+        addi           a3,  a5, \step * 64
+        li             a2,  \step * 128
+        load16          v16, v17, v18, v19
+
+        lla              a1, trans
+
+        vsetivli    zero, 8, e16, m1, tu, ma
+        vle16.v         v0, (a1) 
+
+        tr16_8x4        v16, v17, v18, v19, \offset
+
+        addi             a1,  a5, \step * 32
+        addi             a3,  a5, \step * 3 *32
+        li             a2,  \step * 128
+        load16          v20, v17, v18, v19
+
+        lla              a1, trans
+        addi             a1, a1, 16
+
+        vsetivli    zero, 8, e16, m1, tu, ma
+        vle16.v          v1, (a1) 
+
+        lh      s2, 0*2(a1)
+        lh      s3, 1*2(a1)
+        lh      s4, 2*2(a1)
+        lh      s5, 3*2(a1)
+        lh      s6, 4*2(a1)
+        lh      s7, 5*2(a1)
+        lh      s8, 6*2(a1)
+        lh      s9, 7*2(a1)
+
+        vsetivli    zero, 4, e16, mf2, tu, ma
+        vwmul.vx            v21, v20, s2
+        vwmul.vx            v22, v20, s3
+        vwmul.vx            v23, v20, s4
+        vwmul.vx            v24, v20, s5
+        vwmul.vx            v25, v20, s6
+        vwmul.vx            v26, v20, s7
+        vwmul.vx            v27, v20, s8
+        vwmul.vx            v28, v20, s9
+
+        add_member      v20, s3, s6, s9, s7, s4, s2, s5, s8, +, +, +, -, -, -, -, -, 2
+        add_member      v17, s4, s9, s5, s3, s8, s6, s2, s7, +, +, -, -, -, +, +, +
+        add_member      v17, s5, s7, s3, s9, s2, s8, s4, s6, +, -, -, +, +, +, -, -, 2
+        add_member      v18, s6, s4, s8, s2, s9, s3, s7, s5, +, -, -, +, -, -, +, +
+        add_member      v18, s7, s2, s6, s8, s3, s5, s9, s4, +, -, +, +, -, +, +, -, 2
+        add_member      v19, s8, s5, s2, s4, s7, s9, s6, s3, +, -, +, -, +, +, -, +
+        add_member      v19, s9, s8, s7, s6, s5, s4, s3, s2, +, -, +, -, +, -, +, -, 2
+
+        li a7, \offset
+        add a4, sp, a7
+
+        vsetivli    zero, 4, e32, m1, tu, ma 
+        vle32.v        v16, (a4)
+        addi            a4, a4, 16
+        vle32.v        v17, (a4)
+        addi            a4, a4, 16
+        vle32.v        v18, (a4)
+        addi            a4, a4, 16
+        vle32.v        v19, (a4)
+        addi            a4, a4, 16
+
+        butterfly16       v16, v21, v17, v22, v18, v23, v19, v24
+    .if \shift > 0
+        scale             v29, v30, v31, v24, v20, v16, v21, v17, v22, v18, v23, v19, \shift
+
+        transpose16_4x4_2 29, 30, 31, 24, 2, 3, 4, 5, 6, 7
+
+        mv             a1,  a6
+        addi             a3,  a6, 24 + 3 * 32
+        li             a2, 32
+        li             a4, -32
+
+        store16         v29, v30, v31, v24, a4
+    .else
+        store_to_stack  \offset, (\offset + 240), v20, v21, v22, v23, v19, v18, v17, v16
+    .endif
+
+        li a7, \offset+64
+        add a4, sp, a7
+
+        vsetivli    zero, 4, e32, m1, tu, ma 
+        vle32.v        v16, (a4)
+        addi            a4, a4, 16
+        vle32.v        v17, (a4)
+        addi            a4, a4, 16
+        vle32.v        v18, (a4)
+        addi            a4, a4, 16
+        vle32.v        v19, (a4)
+        addi            a4, a4, 16
+
+        butterfly16     v16, v25, v17, v26, v18, v27, v19, v28
+   .if \shift > 0
+        scale             v29, v30, v31, v20, v20, v16, v25, v17, v26, v18, v27, v19, \shift
+
+        transpose16_4x4_2 29, 30, 31, 20, 2, 3, 4, 5, 6, 7
+
+        addi           a1,  a6, 8
+        addi           a3,  a6, (16 + 3 * 32)
+        li             a2, 32
+        li             a4, -32
+        store16        v29, v30, v31, v20, a4
+   .else
+        store_to_stack  (\offset + 64), (\offset + 176), v20, v25, v26, v27, v19, v18, v17, v16
+   .endif
+        ret
+endfunc
+.endm
+
+tr_16x4 noscale, 0, 2048, 4
+
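+/* func_tr_32x4_*: one 32-point transform over a strip of four columns:
+ * run the 16-point stage without scaling, load the odd rows with load32
+ * and the odd-stage factors from trans + 32 into v0-v3, then process the
+ * odd outputs in four groups (tr_block1-4), each followed by scale_store
+ * to combine with the stacked 16-point results. */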
+.macro tr_32x4 name, shift
+func func_tr_32x4_\name, zve64x
+        mv             t3, ra        
+        jal              func_tr_16x4_noscale
+
+        load32
+
+        lla              t2, trans
+        addi             t2, t2, 32
+
+        vsetivli    zero, 4, e16, m1, tu, ma 
+        vle16.v v0, (t2)
+        addi t2, t2, 2*4
+        vle16.v v1, (t2)
+        addi t2, t2, 2*4
+        vle16.v v2, (t2)
+        addi t2, t2, 2*4
+        vle16.v v3, (t2)
+        addi t2, t2, -2*4
+
+        li a7, 2048
+        add a4, sp, a7
+
+        li               a2, 64
+        li               t1, -64
+        
+        jal               tr_block1
+        mv                a1, t4
+        addi              a3, t4, (56 + 3 * 64)
+        scale_store       \shift
+
+        jal              tr_block2
+        addi             a1, t4, 8
+        addi             a3, t4, (48 + 3 * 64)
+        scale_store      \shift
+
+        jal              tr_block3
+        addi             a1, t4, 16
+        addi             a3, t4, (40 + 3 * 64)
+        scale_store      \shift
+
+        jal              tr_block4
+        addi             a1, t4, 24
+        addi             a3, t4, (32 + 3 * 64)
+        scale_store      \shift
+
+        jr t3
+endfunc
+.endm
+
+tr_32x4 firstpass, 7
+tr_32x4 secondpass_8, 20 - 8
+
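+/* ff_hevc_idct_32x32_*_rvv: save the callee-saved registers, reserve a
+ * 16-byte scratch slot (s0) and a 2432-byte transform buffer, and set vxrm
+ * for the rounding narrowing shifts.  The first pass runs from the
+ * coefficient array into the buffer, the second pass from the buffer back
+ * into the coefficient array, eight four-column strips at a time. */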
+.macro idct_32x32 bitdepth
+func ff_hevc_idct_32x32_\bitdepth\()_rvv, zve64x
+        mv t6, ra
+        addi    sp, sp, -8*13
+        sd    ra, 8*12(sp)
+        sd    s0, 8*11(sp)
+        sd    s1, 8*10(sp)
+        sd    s2, 8*9(sp)
+        sd    s3, 8*8(sp)
+        sd    s4, 8*7(sp)
+        sd    s5, 8*6(sp)
+        sd    s6, 8*5(sp)
+        sd    s7, 8*4(sp)
+        sd    s8, 8*3(sp)
+        sd    s9, 8*2(sp)
+        sd    s10, 8*1(sp)
+        sd    s11, 8*0(sp)
+
+        addi    sp, sp, -16
+        mv s0, sp
+
+        csrwi   vxrm, 1
+        li  a7, 2432 
+        sub sp, sp, a7
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+        li               a7, 8 * \i
+        add              a5, a0, a7
+
+        li               a7, 8 * \i * 32
+        add              t4, sp, a7
+        jal              func_tr_32x4_firstpass
+.endr
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+        addi             a5, sp, 8 * \i
+        addi             t4, a0, 8 * \i * 32
+        jal              func_tr_32x4_secondpass_\bitdepth
+.endr
+
+        li              a7, 2432 
+        add             sp,  sp,  a7
+        
+        addi    sp, sp, 16
+
+        ld    ra, 8*12(sp)
+        ld    s0, 8*11(sp)
+        ld    s1, 8*10(sp)
+        ld    s2, 8*9(sp)
+        ld    s3, 8*8(sp)
+        ld    s4, 8*7(sp)
+        ld    s5, 8*6(sp)
+        ld    s6, 8*5(sp)
+        ld    s7, 8*4(sp)
+        ld    s8, 8*3(sp)
+        ld    s9, 8*2(sp)
+        ld    s10, 8*1(sp)
+        ld    s11, 8*0(sp)
+        addi    sp, sp, 8*13
+
+        jr t6
+endfunc
+.endm
+
+idct_32x32 8
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 1d8326a573..6dfb889eec 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -27,6 +27,8 @@
 #include "libavcodec/hevc/dsp.h"
 #include "libavcodec/riscv/h26x/h2656dsp.h"
 
+void ff_hevc_idct_32x32_8_rvv(int16_t *coeffs, int col_limit);
+
 #define RVV_FNASSIGN(member, v, h, fn, ext) \
         member[1][v][h] = ff_h2656_put_pixels_##8_##ext;  \
         member[3][v][h] = ff_h2656_put_pixels_##8_##ext;  \
@@ -40,27 +42,37 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
     const int flags = av_get_cpu_flags();
     int vlenb;
 
-    if (!(flags & AV_CPU_FLAG_RVV_I32) || !(flags & AV_CPU_FLAG_RVB))
-        return;
-
     vlenb = ff_get_rv_vlenb();
-    if (vlenb >= 32) {
-        switch (bit_depth) {
-            case 8:
-                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
-                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
-                break;
-            default:
-                break;
-        }
-    } else if (vlenb >= 16) {
-        switch (bit_depth) {
-            case 8:
-                RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
-                RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
-                break;
-            default:
-                break;
+
+    if ((flags & AV_CPU_FLAG_RVV_I64) && vlenb >= 16) {
+        switch (bit_depth) {
+            case 8:
+                c->idct[3] = ff_hevc_idct_32x32_8_rvv;
+                break;
+            default:
+                break;
+        }
+    }
+
+    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)) {
+        if (vlenb >= 32) {
+            switch (bit_depth) {
+                case 8:
+                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
+                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
+                    break;
+                default:
+                    break;
+            }
+        } else if (vlenb >= 16) {
+            switch (bit_depth) {
+                case 8:
+                    RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
+                    RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
+                    break;
+                default:
+                    break;
+            }
         }
     }
 #endif
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".


end of thread, other threads:[~2025-04-28  9:33 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-04-15  7:34 [FFmpeg-devel] [PATCH] libavcodec/riscv:add RVV optimized for idct_32x32_8: daichengrong
2025-04-15 15:02 ` Rémi Denis-Courmont
2025-04-17 21:07 ` Michael Niedermayer
2025-04-28  9:32 ` [FFmpeg-devel] [PATCH v v2] " daichengrong
