From: "zhanheng.yang--- via ffmpeg-devel" <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: Zhanheng Yang <zhanheng.yang@linux.alibaba.com>
Subject: [FFmpeg-devel] [PATCH 3/6] libavcodec/riscv: add RVV optimized for epel_h in HEVC.
Date: Thu, 22 Jan 2026 12:23:54 +0800
Message-ID: <20260122042357.1438-3-zhanheng.yang@linux.alibaba.com> (raw)
In-Reply-To: <20260122042357.1438-1-zhanheng.yang@linux.alibaba.com>
From: Zhanheng Yang <zhanheng.yang@linux.alibaba.com>
Benchmarked on an A210 C908 core (VLEN 128).
put_hevc_epel_h4_8_c: 146.2 ( 1.00x)
put_hevc_epel_h4_8_rvv_i32: 81.8 ( 1.79x)
put_hevc_epel_h6_8_c: 305.4 ( 1.00x)
put_hevc_epel_h6_8_rvv_i32: 115.5 ( 2.65x)
put_hevc_epel_h8_8_c: 532.7 ( 1.00x)
put_hevc_epel_h8_8_rvv_i32: 156.7 ( 3.40x)
put_hevc_epel_h12_8_c: 1233.8 ( 1.00x)
put_hevc_epel_h12_8_rvv_i32: 225.7 ( 5.47x)
put_hevc_epel_h16_8_c: 2223.8 ( 1.00x)
put_hevc_epel_h16_8_rvv_i32: 296.2 ( 7.51x)
put_hevc_epel_h24_8_c: 4739.4 ( 1.00x)
put_hevc_epel_h24_8_rvv_i32: 800.7 ( 5.92x)
put_hevc_epel_h32_8_c: 8344.4 ( 1.00x)
put_hevc_epel_h32_8_rvv_i32: 1066.0 ( 7.83x)
put_hevc_epel_h48_8_c: 18595.3 ( 1.00x)
put_hevc_epel_h48_8_rvv_i32: 2324.3 ( 8.00x)
put_hevc_epel_h64_8_c: 32911.2 ( 1.00x)
put_hevc_epel_h64_8_rvv_i32: 4079.8 ( 8.07x)
put_hevc_epel_uni_h4_8_c: 225.1 ( 1.00x)
put_hevc_epel_uni_h4_8_rvv_i32: 99.0 ( 2.27x)
put_hevc_epel_uni_h6_8_c: 500.0 ( 1.00x)
put_hevc_epel_uni_h6_8_rvv_i32: 138.1 ( 3.62x)
put_hevc_epel_uni_h8_8_c: 895.6 ( 1.00x)
put_hevc_epel_uni_h8_8_rvv_i32: 186.3 ( 4.81x)
put_hevc_epel_uni_h12_8_c: 1925.0 ( 1.00x)
put_hevc_epel_uni_h12_8_rvv_i32: 264.4 ( 7.28x)
put_hevc_epel_uni_h16_8_c: 3372.3 ( 1.00x)
put_hevc_epel_uni_h16_8_rvv_i32: 342.7 ( 9.84x)
put_hevc_epel_uni_h24_8_c: 7501.4 ( 1.00x)
put_hevc_epel_uni_h24_8_rvv_i32: 935.6 ( 8.02x)
put_hevc_epel_uni_h32_8_c: 13232.0 ( 1.00x)
put_hevc_epel_uni_h32_8_rvv_i32: 1240.0 (10.67x)
put_hevc_epel_uni_h48_8_c: 29608.1 ( 1.00x)
put_hevc_epel_uni_h48_8_rvv_i32: 2710.5 (10.92x)
put_hevc_epel_uni_h64_8_c: 52452.8 ( 1.00x)
put_hevc_epel_uni_h64_8_rvv_i32: 4775.5 (10.98x)
put_hevc_epel_uni_w_h4_8_c: 298.5 ( 1.00x)
put_hevc_epel_uni_w_h4_8_rvv_i32: 176.6 ( 1.69x)
put_hevc_epel_uni_w_h6_8_c: 645.3 ( 1.00x)
put_hevc_epel_uni_w_h6_8_rvv_i32: 254.9 ( 2.53x)
put_hevc_epel_uni_w_h8_8_c: 1187.0 ( 1.00x)
put_hevc_epel_uni_w_h8_8_rvv_i32: 335.3 ( 3.54x)
put_hevc_epel_uni_w_h12_8_c: 2535.6 ( 1.00x)
put_hevc_epel_uni_w_h12_8_rvv_i32: 487.8 ( 5.20x)
put_hevc_epel_uni_w_h16_8_c: 4491.0 ( 1.00x)
put_hevc_epel_uni_w_h16_8_rvv_i32: 641.8 ( 7.00x)
put_hevc_epel_uni_w_h24_8_c: 9974.7 ( 1.00x)
put_hevc_epel_uni_w_h24_8_rvv_i32: 1791.4 ( 5.57x)
put_hevc_epel_uni_w_h32_8_c: 17646.1 ( 1.00x)
put_hevc_epel_uni_w_h32_8_rvv_i32: 2379.0 ( 7.42x)
put_hevc_epel_uni_w_h48_8_c: 39569.2 ( 1.00x)
put_hevc_epel_uni_w_h48_8_rvv_i32: 5226.0 ( 7.57x)
put_hevc_epel_uni_w_h64_8_c: 70274.5 ( 1.00x)
put_hevc_epel_uni_w_h64_8_rvv_i32: 9214.3 ( 7.63x)
put_hevc_epel_bi_h4_8_c: 234.5 ( 1.00x)
put_hevc_epel_bi_h4_8_rvv_i32: 128.3 ( 1.83x)
put_hevc_epel_bi_h6_8_c: 505.0 ( 1.00x)
put_hevc_epel_bi_h6_8_rvv_i32: 177.1 ( 2.85x)
put_hevc_epel_bi_h8_8_c: 958.2 ( 1.00x)
put_hevc_epel_bi_h8_8_rvv_i32: 235.2 ( 4.07x)
put_hevc_epel_bi_h12_8_c: 2001.0 ( 1.00x)
put_hevc_epel_bi_h12_8_rvv_i32: 338.5 ( 5.91x)
put_hevc_epel_bi_h16_8_c: 3510.2 ( 1.00x)
put_hevc_epel_bi_h16_8_rvv_i32: 446.5 ( 7.86x)
put_hevc_epel_bi_h24_8_c: 7803.2 ( 1.00x)
put_hevc_epel_bi_h24_8_rvv_i32: 1189.6 ( 6.56x)
put_hevc_epel_bi_h32_8_c: 13764.5 ( 1.00x)
put_hevc_epel_bi_h32_8_rvv_i32: 1579.3 ( 8.72x)
put_hevc_epel_bi_h48_8_c: 30827.4 ( 1.00x)
put_hevc_epel_bi_h48_8_rvv_i32: 3422.3 ( 9.01x)
put_hevc_epel_bi_h64_8_c: 54715.6 ( 1.00x)
put_hevc_epel_bi_h64_8_rvv_i32: 6059.8 ( 9.03x)
Signed-off-by: Zhanheng Yang <zhanheng.yang@linux.alibaba.com>
---
libavcodec/riscv/Makefile | 3 +-
libavcodec/riscv/h26x/h2656dsp.h | 12 ++
libavcodec/riscv/h26x/hevcepel_rvv.S | 265 +++++++++++++++++++++++++++
libavcodec/riscv/hevcdsp_init.c | 4 +
4 files changed, 283 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/riscv/h26x/hevcepel_rvv.S
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 414790ae0c..bf65e827e7 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -37,7 +37,8 @@ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o \
- riscv/h26x/hevcqpel_rvv.o
+ riscv/h26x/hevcqpel_rvv.o \
+ riscv/h26x/hevcepel_rvv.o
OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
diff --git a/libavcodec/riscv/h26x/h2656dsp.h b/libavcodec/riscv/h26x/h2656dsp.h
index 2dabc16aee..fa2f5a88e3 100644
--- a/libavcodec/riscv/h26x/h2656dsp.h
+++ b/libavcodec/riscv/h26x/h2656dsp.h
@@ -47,4 +47,16 @@ void ff_hevc_put_qpel_uni_w_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride,
void ff_hevc_put_qpel_bi_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
mx, intptr_t my, int width);
+
+void ff_hevc_put_epel_h_8_m1_rvv(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width); /* horizontal 4-tap filter selected by mx; stores unclipped 16-bit results; my is not read by the RVV code */
+void ff_hevc_put_epel_uni_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); /* filter result clamped at 0, rounded >> 6 and saturated to 8 bits */
+void ff_hevc_put_epel_uni_w_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width); /* filter result * wx, rounded >> (denom + 6), + ox, saturated to 8 bits */
+void ff_hevc_put_epel_bi_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+ ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width); /* filter result + src2 (16-bit), rounded >> 7 and saturated to 8 bits */
#endif
diff --git a/libavcodec/riscv/h26x/hevcepel_rvv.S b/libavcodec/riscv/h26x/hevcepel_rvv.S
new file mode 100644
index 0000000000..81044846f7
--- /dev/null
+++ b/libavcodec/riscv/h26x/hevcepel_rvv.S
@@ -0,0 +1,265 @@
+ /*
+ * Copyright (C) 2026 Alibaba Group Holding Limited.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+.section .rodata /* the coefficients are only ever read, so keep them out of writable .data */
+.align 2
+qpel_filters: /* 8 rows of 4 signed byte taps; load_filter picks row m at byte offset m * 4. NOTE(review): name says qpel but the rows are used by the epel functions - consider renaming together with load_filter */
+ .byte 0, 0, 0, 0
+ .byte -2, 58, 10, -2
+ .byte -4, 54, 16, -2
+ .byte -6, 46, 28, -4
+ .byte -4, 36, 36, -4
+ .byte -4, 28, 46, -6
+ .byte -2, 16, 54, -4
+ .byte -2, 10, 58, -2
+
+.text
+#include "libavutil/riscv/asm.S"
+#define HEVC_MAX_PB_SIZE 64
+
+.macro lx rd, addr /* load one XLEN-wide value from addr into rd; the load instruction is chosen by __riscv_xlen */
+#if (__riscv_xlen == 32)
+ lw \rd, \addr
+#elif (__riscv_xlen == 64)
+ ld \rd, \addr
+#else
+ lq \rd, \addr /* any XLEN other than 32 or 64 falls through to the quad-word form */
+#endif
+.endm
+
+.macro sx rd, addr /* store the XLEN-wide register rd to addr; the store instruction is chosen by __riscv_xlen */
+#if (__riscv_xlen == 32)
+ sw \rd, \addr
+#elif (__riscv_xlen == 64)
+ sd \rd, \addr
+#else
+ sq \rd, \addr /* any XLEN other than 32 or 64 falls through to the quad-word form */
+#endif
+.endm
+
+/* Loads the four signed byte taps of row m of qpel_filters into s1..s4 (which are overwritten); clobbers t0, t1 */
+.macro load_filter m
+ la t0, qpel_filters
+ slli t1, \m, 2 /* each table row is 4 bytes */
+ add t0, t0, t1
+ lb s1, 0(t0) /* lb sign-extends, the table contains negative taps */
+ lb s2, 1(t0)
+ lb s3, 2(t0)
+ lb s4, 3(t0)
+.endm
+
+/* 4-tap horizontal filter: vdst = s1*src[-1] + s2*src[0] + s3*src[1] + s4*src[2], using widening ops on 8-bit loads; output is unclipped; clobbers t4 */
+.macro filter_h vdst, vsrc0, vsrc1, vsrc2, vsrc3, src
+ addi t4, \src, -1
+ vle8.v \vsrc0, (t4) /* bytes starting one position left of src */
+ vmv.v.x \vsrc3, s1 /* splat the first tap; vsrc3 is only a temporary here and is reloaded with pixels below */
+ vwmulsu.vv \vdst, \vsrc3, \vsrc0 /* widening multiply: signed tap vector times unsigned pixels */
+ vle8.v \vsrc1, (\src)
+ addi t4, \src, 1
+ vle8.v \vsrc2, (t4)
+ addi t4, \src, 2
+ vle8.v \vsrc3, (t4)
+
+ vwmaccsu.vx \vdst, s2, \vsrc1 /* accumulate the remaining three taps (signed scalar times unsigned pixels) */
+ vwmaccsu.vx \vdst, s3, \vsrc2
+ vwmaccsu.vx \vdst, s4, \vsrc3
+.endm
+
+.macro vreg
+ /* NOTE(review): this macro has an empty body and is not invoked anywhere in this patch - looks like a leftover; confirm and drop */
+.endm
+
+.macro hevc_epel_h lmul, lmul2, lmul4
+func ff_hevc_put_epel_h_8_\lmul\()_rvv, zve32x /* a0=dst a1=_src a2=_srcstride a3=height a4=mx a5=my (not read) a6=width; writes unclipped 16-bit filter output */
+ addi sp, sp, -32 /* frame for s1-s4, which load_filter overwrites */
+ sx s1, 0(sp) /* slots are 8 bytes apart regardless of XLEN */
+ sx s2, 8(sp)
+ sx s3, 16(sp)
+ sx s4, 24(sp)
+ load_filter a4 /* taps selected by mx */
+ mv t3, a6 /* t3 = pixels still to do in the current row */
+ li t1, 0 # offset
+
+1:
+ vsetvli t6, t3, e8, \lmul, ta, ma /* t6 = pixels handled in this pass */
+ add t2, a1, t1
+ filter_h v0, v16, v18, v20, v22, t2
+ vsetvli zero, zero, e16, \lmul2, ta, ma /* keep vl, treat the widened result as 16-bit elements */
+ slli t2, t1, 1 /* pixel offset to byte offset in the 16-bit destination */
+ add t2, a0, t2
+ vse16.v v0, (t2)
+ sub t3, t3, t6
+ add t1, t1, t6
+ bgt t3, zero, 1b /* more columns left in this row */
+ addi a3, a3, -1
+ mv t3, a6 /* next row: restore width, advance pointers, reset offset */
+ add a1, a1, a2
+ addi a0, a0, 2*HEVC_MAX_PB_SIZE /* destination rows are 2*HEVC_MAX_PB_SIZE bytes apart */
+ li t1, 0
+ bgt a3, zero, 1b /* label 1 is also the head of the row loop */
+
+ lx s1, 0(sp)
+ lx s2, 8(sp)
+ lx s3, 16(sp)
+ lx s4, 24(sp)
+ addi sp, sp, 32
+ ret
+endfunc
+
+func ff_hevc_put_epel_uni_h_8_\lmul\()_rvv, zve32x /* a0=_dst a1=_dststride a2=_src a3=_srcstride a4=height a5=mx a6=my (not read) a7=width; writes 8-bit pixels */
+ csrwi vxrm, 0 /* fixed-point rounding mode 0 (round-to-nearest-up in the RVV spec), used by vnclipu below */
+ addi sp, sp, -32 /* frame for s1-s4, which load_filter overwrites */
+ sx s1, 0(sp)
+ sx s2, 8(sp)
+ sx s3, 16(sp)
+ sx s4, 24(sp)
+ load_filter a5 /* taps selected by mx */
+ mv t3, a7 /* t3 = pixels still to do in the current row */
+ li t1, 0 # offset
+
+1:
+ vsetvli t6, t3, e8, \lmul, ta, ma /* t6 = pixels handled in this pass */
+ add t2, a2, t1
+ filter_h v0, v16, v18, v20, v22, t2
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vmax.vx v0, v0, zero /* clamp negatives to 0 before the unsigned narrowing */
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v0, v0, 6 /* rounding shift right by 6, saturating to 8 bits */
+ add t2, a0, t1
+ vse8.v v0, (t2)
+ sub t3, t3, t6
+ add t1, t1, t6
+ bgt t3, zero, 1b /* more columns left in this row */
+ addi a4, a4, -1
+ mv t3, a7 /* next row: restore width, advance both pointers by their strides */
+ add a2, a2, a3
+ add a0, a0, a1
+ li t1, 0
+ bgt a4, zero, 1b /* label 1 is also the head of the row loop */
+
+ lx s1, 0(sp)
+ lx s2, 8(sp)
+ lx s3, 16(sp)
+ lx s4, 24(sp)
+ addi sp, sp, 32
+ ret
+endfunc
+
+func ff_hevc_put_epel_uni_w_h_8_\lmul\()_rvv, zve32x /* a0=_dst a1=_dststride a2=_src a3=_srcstride a4=height a5=denom a6=wx a7=ox; mx, my (not read) and width are on the stack */
+ csrwi vxrm, 0 /* fixed-point rounding mode 0 (round-to-nearest-up in the RVV spec), used by vssra below */
+ lx t2, 0(sp) # mx
+ addi a5, a5, 6 # shift
+#if (__riscv_xlen == 32)
+ lw t3, 8(sp) # width
+#elif (__riscv_xlen == 64)
+ lw t3, 16(sp)
+#endif
+ addi sp, sp, -32 /* frame for s1-s4; incoming stack arguments are now 32 bytes further up */
+ sx s1, 0(sp)
+ sx s2, 8(sp)
+ sx s3, 16(sp)
+ sx s4, 24(sp)
+ load_filter t2 /* taps selected by mx */
+ li t2, 0 # offset
+
+1:
+ vsetvli t6, t3, e8, \lmul, ta, ma /* t6 = pixels handled in this pass */
+ add t1, a2, t2
+ filter_h v8, v16, v18, v20, v22, t1
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vwmul.vx v0, v8, a6 /* widen to 32 bits: filter output times wx */
+ vsetvli zero, zero, e32, \lmul4, ta, ma
+ vssra.vx v0, v0, a5 /* rounding shift right by denom + 6 */
+ vsadd.vx v0, v0, a7 /* add ox */
+ vmax.vx v0, v0, zero /* clamp negatives to 0 */
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vnclip.wi v0, v0, 0 /* narrow 32 to 16 bits with signed saturation */
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v0, v0, 0 /* narrow 16 to 8 bits with unsigned saturation */
+ add t1, a0, t2
+ vse8.v v0, (t1)
+ sub t3, t3, t6
+ add t2, t2, t6
+ bgt t3, zero, 1b /* more columns left in this row */
+ addi a4, a4, -1
+#if (__riscv_xlen == 32)
+ lw t3, 40(sp) /* reload width: entry-time 8(sp) plus the 32-byte frame */
+#elif (__riscv_xlen == 64)
+ lw t3, 48(sp) /* reload width: entry-time 16(sp) plus the frame; lw because width is an int, matching the load at entry */
+#endif
+ add a2, a2, a3
+ add a0, a0, a1
+ li t2, 0
+ bgt a4, zero, 1b /* label 1 is also the head of the row loop */
+
+ lx s1, 0(sp)
+ lx s2, 8(sp)
+ lx s3, 16(sp)
+ lx s4, 24(sp)
+ addi sp, sp, 32
+ ret
+endfunc
+
+func ff_hevc_put_epel_bi_h_8_\lmul\()_rvv, zve32x /* a0=_dst a1=_dststride a2=_src a3=_srcstride a4=src2 a5=height a6=mx a7=my (not read); width is on the stack */
+ csrwi vxrm, 0 /* fixed-point rounding mode 0 (round-to-nearest-up in the RVV spec), used by vnclipu below */
+ lw t3, 0(sp) # width
+ addi sp, sp, -32 /* frame for s1-s4; the stacked width is now at 32(sp) */
+ sx s1, 0(sp)
+ sx s2, 8(sp)
+ sx s3, 16(sp)
+ sx s4, 24(sp)
+ load_filter a6 /* taps selected by mx */
+ li t1, 0 # offset
+
+1:
+ vsetvli t6, t3, e16, \lmul2, ta, ma /* t6 = pixels handled in this pass */
+ slli t2, t1, 1 /* pixel offset to byte offset in the 16-bit src2 row */
+ add t2, a4, t2
+ vle16.v v12, (t2)
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ add t2, a2, t1
+ filter_h v0, v16, v18, v20, v22, t2
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vsadd.vv v0, v0, v12 /* saturating add of the filter output and src2 */
+ vmax.vx v0, v0, zero /* clamp negatives to 0 before the unsigned narrowing */
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v0, v0, 7 /* rounding shift right by 7, saturating to 8 bits */
+ add t2, a0, t1
+ vse8.v v0, (t2)
+ sub t3, t3, t6
+ add t1, t1, t6
+ bgt t3, zero, 1b /* more columns left in this row */
+ addi a5, a5, -1
+ lw t3, 32(sp) /* reload width for the next row */
+ add a2, a2, a3
+ add a0, a0, a1
+ addi a4, a4, 2*HEVC_MAX_PB_SIZE /* src2 rows are 2*HEVC_MAX_PB_SIZE bytes apart */
+ li t1, 0
+ bgt a5, zero, 1b /* label 1 is also the head of the row loop */
+
+ lx s1, 0(sp)
+ lx s2, 8(sp)
+ lx s3, 16(sp)
+ lx s4, 24(sp)
+ addi sp, sp, 32
+ ret
+endfunc
+.endm
+
+hevc_epel_h m1, m2, m4 /* instantiate the four functions with register group sizes m1 (8-bit), m2 (16-bit) and m4 (32-bit) */
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 480cfd2968..8608fdbd19 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -90,6 +90,10 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
RVV_FNASSIGN_PEL(c->put_hevc_qpel_uni_w, 1, 0, ff_hevc_put_qpel_uni_w_v_8_m1_rvv);
RVV_FNASSIGN_PEL(c->put_hevc_qpel_bi, 1, 0, ff_hevc_put_qpel_bi_v_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_epel, 0, 1, ff_hevc_put_epel_h_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 0, 1, ff_hevc_put_epel_uni_h_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_epel_uni_w, 0, 1, ff_hevc_put_epel_uni_w_h_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_epel_bi, 0, 1, ff_hevc_put_epel_bi_h_8_m1_rvv);
break;
default:
break;
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
next prev parent reply other threads:[~2026-01-24 15:57 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-22 4:23 [FFmpeg-devel] [PATCH 1/6] libavcodec/riscv: add RVV optimized for qpel_h " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 2/6] libavcodec/riscv: add RVV optimized for qpel_v " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` zhanheng.yang--- via ffmpeg-devel [this message]
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 4/6] libavcodec/riscv: add RVV optimized for epel_v " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 5/6] libavcodec/riscv: add RVV optimized for qpel_hv " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 6/6] libavcodec/riscv: add RVV optimized for epel_hv " zhanheng.yang--- via ffmpeg-devel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260122042357.1438-3-zhanheng.yang@linux.alibaba.com \
--to=ffmpeg-devel@ffmpeg.org \
--cc=zhanheng.yang@linux.alibaba.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git