From: "zhanheng.yang--- via ffmpeg-devel" <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: Zhanheng Yang <zhanheng.yang@linux.alibaba.com>
Subject: [FFmpeg-devel] [PATCH 5/6] libavcodec/riscv: add RVV optimized for qpel_hv in HEVC.
Date: Thu, 22 Jan 2026 12:23:56 +0800
Message-ID: <20260122042357.1438-5-zhanheng.yang@linux.alibaba.com> (raw)
In-Reply-To: <20260122042357.1438-1-zhanheng.yang@linux.alibaba.com>
From: Zhanheng Yang <zhanheng.yang@linux.alibaba.com>
Bench on A210 C908 core (VLEN 128).
put_hevc_qpel_hv4_8_c: 865.6 ( 1.00x)
put_hevc_qpel_hv4_8_rvv_i32: 501.8 ( 1.72x)
put_hevc_qpel_hv6_8_c: 1602.9 ( 1.00x)
put_hevc_qpel_hv6_8_rvv_i32: 635.4 ( 2.52x)
put_hevc_qpel_hv8_8_c: 2571.2 ( 1.00x)
put_hevc_qpel_hv8_8_rvv_i32: 774.1 ( 3.32x)
put_hevc_qpel_hv12_8_c: 5366.3 ( 1.00x)
put_hevc_qpel_hv12_8_rvv_i32: 1049.3 ( 5.11x)
put_hevc_qpel_hv16_8_c: 8959.2 ( 1.00x)
put_hevc_qpel_hv16_8_rvv_i32: 1328.1 ( 6.75x)
put_hevc_qpel_hv24_8_c: 18969.7 ( 1.00x)
put_hevc_qpel_hv24_8_rvv_i32: 3712.5 ( 5.11x)
put_hevc_qpel_hv32_8_c: 32674.3 ( 1.00x)
put_hevc_qpel_hv32_8_rvv_i32: 4806.7 ( 6.80x)
put_hevc_qpel_hv48_8_c: 71309.9 ( 1.00x)
put_hevc_qpel_hv48_8_rvv_i32: 10465.8 ( 6.81x)
put_hevc_qpel_hv64_8_c: 124846.0 ( 1.00x)
put_hevc_qpel_hv64_8_rvv_i32: 18306.5 ( 6.82x)
put_hevc_qpel_uni_hv4_8_c: 920.4 ( 1.00x)
put_hevc_qpel_uni_hv4_8_rvv_i32: 532.1 ( 1.73x)
put_hevc_qpel_uni_hv6_8_c: 1753.0 ( 1.00x)
put_hevc_qpel_uni_hv6_8_rvv_i32: 691.0 ( 2.54x)
put_hevc_qpel_uni_hv8_8_c: 2872.7 ( 1.00x)
put_hevc_qpel_uni_hv8_8_rvv_i32: 836.9 ( 3.43x)
put_hevc_qpel_uni_hv12_8_c: 5828.4 ( 1.00x)
put_hevc_qpel_uni_hv12_8_rvv_i32: 1141.2 ( 5.11x)
put_hevc_qpel_uni_hv16_8_c: 9906.7 ( 1.00x)
put_hevc_qpel_uni_hv16_8_rvv_i32: 1452.5 ( 6.82x)
put_hevc_qpel_uni_hv24_8_c: 20871.3 ( 1.00x)
put_hevc_qpel_uni_hv24_8_rvv_i32: 4094.0 ( 5.10x)
put_hevc_qpel_uni_hv32_8_c: 36123.3 ( 1.00x)
put_hevc_qpel_uni_hv32_8_rvv_i32: 5310.5 ( 6.80x)
put_hevc_qpel_uni_hv48_8_c: 79016.0 ( 1.00x)
put_hevc_qpel_uni_hv48_8_rvv_i32: 11591.2 ( 6.82x)
put_hevc_qpel_uni_hv64_8_c: 138779.8 ( 1.00x)
put_hevc_qpel_uni_hv64_8_rvv_i32: 20321.1 ( 6.83x)
put_hevc_qpel_uni_w_hv4_8_c: 988.8 ( 1.00x)
put_hevc_qpel_uni_w_hv4_8_rvv_i32: 580.3 ( 1.70x)
put_hevc_qpel_uni_w_hv6_8_c: 1871.5 ( 1.00x)
put_hevc_qpel_uni_w_hv6_8_rvv_i32: 751.7 ( 2.49x)
put_hevc_qpel_uni_w_hv8_8_c: 3089.8 ( 1.00x)
put_hevc_qpel_uni_w_hv8_8_rvv_i32: 923.7 ( 3.35x)
put_hevc_qpel_uni_w_hv12_8_c: 6384.8 ( 1.00x)
put_hevc_qpel_uni_w_hv12_8_rvv_i32: 1266.7 ( 5.04x)
put_hevc_qpel_uni_w_hv16_8_c: 10844.7 ( 1.00x)
put_hevc_qpel_uni_w_hv16_8_rvv_i32: 1612.2 ( 6.73x)
put_hevc_qpel_uni_w_hv24_8_c: 23060.9 ( 1.00x)
put_hevc_qpel_uni_w_hv24_8_rvv_i32: 4560.2 ( 5.06x)
put_hevc_qpel_uni_w_hv32_8_c: 39977.0 ( 1.00x)
put_hevc_qpel_uni_w_hv32_8_rvv_i32: 5927.0 ( 6.74x)
put_hevc_qpel_uni_w_hv48_8_c: 87560.3 ( 1.00x)
put_hevc_qpel_uni_w_hv48_8_rvv_i32: 12978.3 ( 6.75x)
put_hevc_qpel_uni_w_hv64_8_c: 153980.5 ( 1.00x)
put_hevc_qpel_uni_w_hv64_8_rvv_i32: 22823.0 ( 6.75x)
put_hevc_qpel_bi_hv4_8_c: 938.5 ( 1.00x)
put_hevc_qpel_bi_hv4_8_rvv_i32: 541.4 ( 1.73x)
put_hevc_qpel_bi_hv6_8_c: 1760.1 ( 1.00x)
put_hevc_qpel_bi_hv6_8_rvv_i32: 695.9 ( 2.53x)
put_hevc_qpel_bi_hv8_8_c: 2924.3 ( 1.00x)
put_hevc_qpel_bi_hv8_8_rvv_i32: 849.3 ( 3.44x)
put_hevc_qpel_bi_hv12_8_c: 5992.7 ( 1.00x)
put_hevc_qpel_bi_hv12_8_rvv_i32: 1157.5 ( 5.18x)
put_hevc_qpel_bi_hv16_8_c: 10065.4 ( 1.00x)
put_hevc_qpel_bi_hv16_8_rvv_i32: 1473.6 ( 6.83x)
put_hevc_qpel_bi_hv24_8_c: 21450.2 ( 1.00x)
put_hevc_qpel_bi_hv24_8_rvv_i32: 4151.3 ( 5.17x)
put_hevc_qpel_bi_hv32_8_c: 37107.8 ( 1.00x)
put_hevc_qpel_bi_hv32_8_rvv_i32: 5386.4 ( 6.89x)
put_hevc_qpel_bi_hv48_8_c: 81401.7 ( 1.00x)
put_hevc_qpel_bi_hv48_8_rvv_i32: 11761.7 ( 6.92x)
put_hevc_qpel_bi_hv64_8_c: 143503.3 ( 1.00x)
put_hevc_qpel_bi_hv64_8_rvv_i32: 20700.3 ( 6.93x)
Signed-off-by: Zhanheng Yang <zhanheng.yang@linux.alibaba.com>
---
libavcodec/riscv/h26x/h2656dsp.h | 11 +
libavcodec/riscv/h26x/hevcqpel_rvv.S | 386 ++++++++++++++++++++++++++-
libavcodec/riscv/hevcdsp_init.c | 4 +
3 files changed, 400 insertions(+), 1 deletion(-)
diff --git a/libavcodec/riscv/h26x/h2656dsp.h b/libavcodec/riscv/h26x/h2656dsp.h
index 085ed4cf14..7e320bd795 100644
--- a/libavcodec/riscv/h26x/h2656dsp.h
+++ b/libavcodec/riscv/h26x/h2656dsp.h
@@ -47,6 +47,17 @@ void ff_hevc_put_qpel_uni_w_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride,
void ff_hevc_put_qpel_bi_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
mx, intptr_t my, int width);
+void ff_hevc_put_qpel_hv_8_m1_rvv(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_hv_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_w_hv_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_bi_hv_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+ ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width);
void ff_hevc_put_epel_h_8_m1_rvv(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
intptr_t mx, intptr_t my, int width);
diff --git a/libavcodec/riscv/h26x/hevcqpel_rvv.S b/libavcodec/riscv/h26x/hevcqpel_rvv.S
index 8fd3c47bcc..ed7fa8fe00 100644
--- a/libavcodec/riscv/h26x/hevcqpel_rvv.S
+++ b/libavcodec/riscv/h26x/hevcqpel_rvv.S
@@ -619,4 +619,388 @@ func ff_hevc_put_qpel_bi_v_8_\lmul\()_rvv, zve32x
endfunc
.endm
-hevc_qpel_v m1, m2, m4
\ No newline at end of file
+hevc_qpel_v m1, m2, m4
+
+/* clobbers reg t4 */
+.macro filter_v_s vdst, vsrc0, vsrc1, vsrc2, vsrc3, vsrc4, vsrc5, vsrc6, vsrc7, vf
+ vwmul.vx \vdst, \vsrc0, s0
+ vwmacc.vx \vdst, s9, \vsrc1
+ vmv.v.v \vsrc0, \vsrc1
+ vwmacc.vx \vdst, s10, \vsrc2
+ vmv.v.v \vsrc1, \vsrc2
+ vwmacc.vx \vdst, s11, \vsrc3
+ vmv.v.v \vsrc2, \vsrc3
+ lb t4, 4(\vf)
+ vwmacc.vx \vdst, t4, \vsrc4
+ lb t4, 5(\vf)
+ vmv.v.v \vsrc3, \vsrc4
+ vwmacc.vx \vdst, t4, \vsrc5
+ lb t4, 6(\vf)
+ vmv.v.v \vsrc4, \vsrc5
+ vwmacc.vx \vdst, t4, \vsrc6
+ lb t4, 7(\vf)
+ vmv.v.v \vsrc5, \vsrc6
+ vwmacc.vx \vdst, t4, \vsrc7
+ vmv.v.v \vsrc6, \vsrc7
+.endm
+
+/* Loads qpel filter coefficients 0-3 into s0, s9, s10, s11 and returns the
+ * filter base pointer in \m (copied from t0); clobbers t0, t1. There are not
+ * enough free scalar registers to cache all 8 coefficients, so coefficients
+ * 4-7 are reloaded on demand inside filter_v_s. */
+.macro load_filter2 m
+ la t0, qpel_filters
+ slli t1, \m, 3
+ add t0, t0, t1
+ lb s0, 0(t0)
+ lb s9, 1(t0)
+ lb s10, 2(t0)
+ lb s11, 3(t0)
+ mv \m, t0
+.endm
+
+.macro hevc_qpel_hv lmul, lmul2, lmul4
+func ff_hevc_put_qpel_hv_8_\lmul\()_rvv, zve32x
+ csrwi vxrm, 2
+ addi sp, sp, -96
+ sx s0, 0(sp)
+ sx s1, 8(sp)
+ sx s2, 16(sp)
+ sx s3, 24(sp)
+ sx s4, 32(sp)
+ sx s5, 40(sp)
+ sx s6, 48(sp)
+ sx s7, 56(sp)
+ sx s8, 64(sp)
+ sx s9, 72(sp)
+ sx s10, 80(sp)
+ sx s11, 88(sp)
+ load_filter a4
+ load_filter2 a5
+ slli t1, a2, 1
+ add t1, t1, a2
+ sub a1, a1, t1 # src - 3 * src_stride
+ mv t0, a3
+ li t1, 0 # offset
+
+1:
+ add t2, a1, t1
+ slli t3, t1, 1
+ add t3, a0, t3
+
+ vsetvli t6, a6, e8, \lmul, ta, ma
+ filter_h v4, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a2
+ filter_h v6, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a2
+ filter_h v8, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a2
+ filter_h v10, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a2
+ filter_h v12, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a2
+ filter_h v14, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a2
+ filter_h v16, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a2
+
+2:
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ filter_h v18, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a2
+
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ filter_v_s v0, v4, v6, v8, v10, v12, v14, v16, v18, a5
+ vnclip.wi v0, v0, 6
+ vse16.v v0, (t3)
+ addi a3, a3, -1
+ addi t3, t3, 2*HEVC_MAX_PB_SIZE
+ bgt a3, zero, 2b
+ mv a3, t0
+ add t1, t1, t6
+ sub a6, a6, t6
+ bgt a6, zero, 1b
+
+ lx s0, 0(sp)
+ lx s1, 8(sp)
+ lx s2, 16(sp)
+ lx s3, 24(sp)
+ lx s4, 32(sp)
+ lx s5, 40(sp)
+ lx s6, 48(sp)
+ lx s7, 56(sp)
+ lx s8, 64(sp)
+ lx s9, 72(sp)
+ lx s10, 80(sp)
+ lx s11, 88(sp)
+ addi sp, sp, 96
+ ret
+endfunc
+
+func ff_hevc_put_qpel_uni_hv_8_\lmul\()_rvv, zve32x
+ csrwi vxrm, 0
+ addi sp, sp, -96
+ sx s0, 0(sp)
+ sx s1, 8(sp)
+ sx s2, 16(sp)
+ sx s3, 24(sp)
+ sx s4, 32(sp)
+ sx s5, 40(sp)
+ sx s6, 48(sp)
+ sx s7, 56(sp)
+ sx s8, 64(sp)
+ sx s9, 72(sp)
+ sx s10, 80(sp)
+ sx s11, 88(sp)
+ load_filter a5
+ load_filter2 a6
+ slli t1, a3, 1
+ add t1, t1, a3
+ sub a2, a2, t1 # src - 3 * src_stride
+ mv t0, a4
+ li t1, 0 # offset
+
+1:
+ add t2, a2, t1
+ add t3, a0, t1
+
+ vsetvli t6, a7, e8, \lmul, ta, ma
+ filter_h v4, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v6, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v8, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v10, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v12, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v14, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v16, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+
+2:
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ filter_h v18, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ filter_v_s v0, v4, v6, v8, v10, v12, v14, v16, v18, a6
+ vsetvli zero, zero, e32, \lmul4, ta, ma
+ vsra.vi v0, v0, 6
+ vmax.vx v0, v0, zero
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vnclipu.wi v0, v0, 6
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v0, v0, 0
+ vse8.v v0, (t3)
+ addi a4, a4, -1
+ add t3, t3, a1
+ bgt a4, zero, 2b
+ mv a4, t0
+ add t1, t1, t6
+ sub a7, a7, t6
+ bgt a7, zero, 1b
+
+ lx s0, 0(sp)
+ lx s1, 8(sp)
+ lx s2, 16(sp)
+ lx s3, 24(sp)
+ lx s4, 32(sp)
+ lx s5, 40(sp)
+ lx s6, 48(sp)
+ lx s7, 56(sp)
+ lx s8, 64(sp)
+ lx s9, 72(sp)
+ lx s10, 80(sp)
+ lx s11, 88(sp)
+ addi sp, sp, 96
+ ret
+endfunc
+
+func ff_hevc_put_qpel_uni_w_hv_8_\lmul\()_rvv, zve32x
+ csrwi vxrm, 0
+ lx t2, 0(sp) # mx
+#if (__riscv_xlen == 32)
+ lw t4, 4(sp) # my
+ lw t5, 8(sp) # width
+#elif (__riscv_xlen == 64)
+ ld t4, 8(sp)
+ lw t5, 16(sp)
+#endif
+ addi a5, a5, 6 # shift
+    addi sp, sp, -104 # NOTE(review): 104 is not a multiple of 16; the RISC-V psABI requires sp to stay 16-byte aligned -- consider reserving 112 bytes here and at the matching release below
+ sx s0, 0(sp)
+ sx s1, 8(sp)
+ sx s2, 16(sp)
+ sx s3, 24(sp)
+ sx s4, 32(sp)
+ sx s5, 40(sp)
+ sx s6, 48(sp)
+ sx s7, 56(sp)
+ sx s8, 64(sp)
+ sx s9, 72(sp)
+ sx s10, 80(sp)
+ sx s11, 88(sp)
+ sx ra, 96(sp)
+ mv ra, t4
+ load_filter t2
+ load_filter2 ra
+ slli t1, a3, 1
+ add t1, t1, a3
+ sub a2, a2, t1 # src - 3 * src_stride
+ mv t0, a4
+ li t1, 0 # offset
+
+1:
+ add t2, a2, t1
+ add t3, a0, t1
+
+ vsetvli t6, t5, e8, \lmul, ta, ma
+ filter_h v4, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v6, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v8, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v10, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v12, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v14, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v16, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+
+2:
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ filter_h v18, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ filter_v_s v0, v4, v6, v8, v10, v12, v14, v16, v18, ra
+ vsetvli zero, zero, e32, \lmul4, ta, ma
+ vsra.vi v0, v0, 6
+ vmul.vx v0, v0, a6
+ vssra.vx v0, v0, a5
+ vsadd.vx v0, v0, a7
+ vmax.vx v0, v0, zero
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vnclip.wi v0, v0, 0
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v0, v0, 0
+ vse8.v v0, (t3)
+ addi a4, a4, -1
+ add t3, t3, a1
+ bgt a4, zero, 2b
+ mv a4, t0
+ add t1, t1, t6
+ sub t5, t5, t6
+ bgt t5, zero, 1b
+
+ lx s0, 0(sp)
+ lx s1, 8(sp)
+ lx s2, 16(sp)
+ lx s3, 24(sp)
+ lx s4, 32(sp)
+ lx s5, 40(sp)
+ lx s6, 48(sp)
+ lx s7, 56(sp)
+ lx s8, 64(sp)
+ lx s9, 72(sp)
+ lx s10, 80(sp)
+ lx s11, 88(sp)
+ lx ra, 96(sp)
+ addi sp, sp, 104
+ ret
+endfunc
+
+func ff_hevc_put_qpel_bi_hv_8_\lmul\()_rvv, zve32x
+ csrwi vxrm, 0
+ lw t3, 0(sp) # width
+ addi sp, sp, -96
+ sx s0, 0(sp)
+ sx s1, 8(sp)
+ sx s2, 16(sp)
+ sx s3, 24(sp)
+ sx s4, 32(sp)
+ sx s5, 40(sp)
+ sx s6, 48(sp)
+ sx s7, 56(sp)
+ sx s8, 64(sp)
+ sx s9, 72(sp)
+ sx s10, 80(sp)
+ sx s11, 88(sp)
+ load_filter a6
+ load_filter2 a7
+ mv a6, t3
+ slli t1, a3, 1
+ add t1, t1, a3
+ sub a2, a2, t1 # src - 3 * src_stride
+ mv t0, a5
+ li t1, 0 # offset
+
+1:
+ add t2, a2, t1
+ add t3, a0, t1
+ slli t5, t1, 1
+ add t5, a4, t5
+
+ vsetvli t6, a6, e8, \lmul, ta, ma
+ filter_h v4, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v6, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v8, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v10, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v12, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v14, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+ filter_h v16, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+
+2:
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ filter_h v18, v24, v25, v26, v27, v28, v29, v30, v31, t2
+ add t2, t2, a3
+
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+    vle16.v v24, (t5)
+ addi t5, t5, 2*HEVC_MAX_PB_SIZE
+ filter_v_s v0, v4, v6, v8, v10, v12, v14, v16, v18, a7
+ vsetvli zero, zero, e32, \lmul4, ta, ma
+ vsra.vi v0, v0, 6
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vwadd.wv v0, v0, v24
+ vnclip.wi v0, v0, 7
+ vmax.vx v0, v0, zero
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v0, v0, 0
+ vse8.v v0, (t3)
+ addi a5, a5, -1
+ add t3, t3, a1
+ bgt a5, zero, 2b
+ mv a5, t0
+ add t1, t1, t6
+ sub a6, a6, t6
+ bgt a6, zero, 1b
+
+ lx s0, 0(sp)
+ lx s1, 8(sp)
+ lx s2, 16(sp)
+ lx s3, 24(sp)
+ lx s4, 32(sp)
+ lx s5, 40(sp)
+ lx s6, 48(sp)
+ lx s7, 56(sp)
+ lx s8, 64(sp)
+ lx s9, 72(sp)
+ lx s10, 80(sp)
+ lx s11, 88(sp)
+ addi sp, sp, 96
+ ret
+endfunc
+.endm
+
+hevc_qpel_hv m1, m2, m4
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index c7874996a8..53c800626f 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -89,6 +89,10 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
RVV_FNASSIGN_PEL(c->put_hevc_qpel_uni, 1, 0, ff_hevc_put_qpel_uni_v_8_m1_rvv);
RVV_FNASSIGN_PEL(c->put_hevc_qpel_uni_w, 1, 0, ff_hevc_put_qpel_uni_w_v_8_m1_rvv);
RVV_FNASSIGN_PEL(c->put_hevc_qpel_bi, 1, 0, ff_hevc_put_qpel_bi_v_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_qpel, 1, 1, ff_hevc_put_qpel_hv_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_qpel_uni, 1, 1, ff_hevc_put_qpel_uni_hv_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_qpel_uni_w, 1, 1, ff_hevc_put_qpel_uni_w_hv_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_qpel_bi, 1, 1, ff_hevc_put_qpel_bi_hv_8_m1_rvv);
RVV_FNASSIGN_PEL(c->put_hevc_epel, 0, 1, ff_hevc_put_epel_h_8_m1_rvv);
RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 0, 1, ff_hevc_put_epel_uni_h_8_m1_rvv);
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
next prev parent reply other threads:[~2026-01-24 15:58 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-22 4:23 [FFmpeg-devel] [PATCH 1/6] libavcodec/riscv: add RVV optimized for qpel_h " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 2/6] libavcodec/riscv: add RVV optimized for qpel_v " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 3/6] libavcodec/riscv: add RVV optimized for epel_h " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 4/6] libavcodec/riscv: add RVV optimized for epel_v " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` zhanheng.yang--- via ffmpeg-devel [this message]
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 6/6] libavcodec/riscv: add RVV optimized for epel_hv " zhanheng.yang--- via ffmpeg-devel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260122042357.1438-5-zhanheng.yang@linux.alibaba.com \
--to=ffmpeg-devel@ffmpeg.org \
--cc=zhanheng.yang@linux.alibaba.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git