From: "zhanheng.yang--- via ffmpeg-devel" <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: Zhanheng Yang <zhanheng.yang@linux.alibaba.com>
Subject: [FFmpeg-devel] [PATCH 4/6] libavcodec/riscv: add RVV optimized for epel_v in HEVC.
Date: Thu, 22 Jan 2026 12:23:55 +0800
Message-ID: <20260122042357.1438-4-zhanheng.yang@linux.alibaba.com> (raw)
In-Reply-To: <20260122042357.1438-1-zhanheng.yang@linux.alibaba.com>
From: Zhanheng Yang <zhanheng.yang@linux.alibaba.com>
Benchmarked on an A210 C908 core (VLEN 128):
put_hevc_epel_v4_8_c: 157.8 ( 1.00x)
put_hevc_epel_v4_8_rvv_i32: 73.2 ( 2.16x)
put_hevc_epel_v6_8_c: 314.6 ( 1.00x)
put_hevc_epel_v6_8_rvv_i32: 101.2 ( 3.11x)
put_hevc_epel_v8_8_c: 545.5 ( 1.00x)
put_hevc_epel_v8_8_rvv_i32: 124.4 ( 4.39x)
put_hevc_epel_v12_8_c: 1240.8 ( 1.00x)
put_hevc_epel_v12_8_rvv_i32: 183.6 ( 6.76x)
put_hevc_epel_v16_8_c: 2170.7 ( 1.00x)
put_hevc_epel_v16_8_rvv_i32: 235.1 ( 9.23x)
put_hevc_epel_v24_8_c: 4743.5 ( 1.00x)
put_hevc_epel_v24_8_rvv_i32: 677.5 ( 7.00x)
put_hevc_epel_v32_8_c: 8353.4 ( 1.00x)
put_hevc_epel_v32_8_rvv_i32: 892.1 ( 9.36x)
put_hevc_epel_v48_8_c: 18608.1 ( 1.00x)
put_hevc_epel_v48_8_rvv_i32: 1956.1 ( 9.51x)
put_hevc_epel_v64_8_c: 32934.3 ( 1.00x)
put_hevc_epel_v64_8_rvv_i32: 3454.1 ( 9.53x)
put_hevc_epel_uni_v4_8_c: 237.5 ( 1.00x)
put_hevc_epel_uni_v4_8_rvv_i32: 87.5 ( 2.72x)
put_hevc_epel_uni_v6_8_c: 509.5 ( 1.00x)
put_hevc_epel_uni_v6_8_rvv_i32: 119.6 ( 4.26x)
put_hevc_epel_uni_v8_8_c: 982.8 ( 1.00x)
put_hevc_epel_uni_v8_8_rvv_i32: 147.1 ( 6.68x)
put_hevc_epel_uni_v12_8_c: 2027.7 ( 1.00x)
put_hevc_epel_uni_v12_8_rvv_i32: 211.0 ( 9.61x)
put_hevc_epel_uni_v16_8_c: 3525.4 ( 1.00x)
put_hevc_epel_uni_v16_8_rvv_i32: 278.8 (12.64x)
put_hevc_epel_uni_v24_8_c: 7804.3 ( 1.00x)
put_hevc_epel_uni_v24_8_rvv_i32: 778.9 (10.02x)
put_hevc_epel_uni_v32_8_c: 13807.3 ( 1.00x)
put_hevc_epel_uni_v32_8_rvv_i32: 1028.7 (13.42x)
put_hevc_epel_uni_v48_8_c: 30934.9 ( 1.00x)
put_hevc_epel_uni_v48_8_rvv_i32: 2265.1 (13.66x)
put_hevc_epel_uni_v64_8_c: 54705.5 ( 1.00x)
put_hevc_epel_uni_v64_8_rvv_i32: 4003.7 (13.66x)
put_hevc_epel_uni_w_v4_8_c: 313.8 ( 1.00x)
put_hevc_epel_uni_w_v4_8_rvv_i32: 156.6 ( 2.00x)
put_hevc_epel_uni_w_v6_8_c: 674.3 ( 1.00x)
put_hevc_epel_uni_w_v6_8_rvv_i32: 222.8 ( 3.03x)
put_hevc_epel_uni_w_v8_8_c: 1253.3 ( 1.00x)
put_hevc_epel_uni_w_v8_8_rvv_i32: 279.4 ( 4.49x)
put_hevc_epel_uni_w_v12_8_c: 2619.4 ( 1.00x)
put_hevc_epel_uni_w_v12_8_rvv_i32: 410.2 ( 6.39x)
put_hevc_epel_uni_w_v16_8_c: 4614.2 ( 1.00x)
put_hevc_epel_uni_w_v16_8_rvv_i32: 535.8 ( 8.61x)
put_hevc_epel_uni_w_v24_8_c: 10290.6 ( 1.00x)
put_hevc_epel_uni_w_v24_8_rvv_i32: 1550.6 ( 6.64x)
put_hevc_epel_uni_w_v32_8_c: 18169.4 ( 1.00x)
put_hevc_epel_uni_w_v32_8_rvv_i32: 2047.2 ( 8.88x)
put_hevc_epel_uni_w_v48_8_c: 40704.3 ( 1.00x)
put_hevc_epel_uni_w_v48_8_rvv_i32: 4552.4 ( 8.94x)
put_hevc_epel_uni_w_v64_8_c: 72197.1 ( 1.00x)
put_hevc_epel_uni_w_v64_8_rvv_i32: 8069.4 ( 8.95x)
put_hevc_epel_bi_v4_8_c: 262.7 ( 1.00x)
put_hevc_epel_bi_v4_8_rvv_i32: 105.9 ( 2.48x)
put_hevc_epel_bi_v6_8_c: 553.0 ( 1.00x)
put_hevc_epel_bi_v6_8_rvv_i32: 145.4 ( 3.80x)
put_hevc_epel_bi_v8_8_c: 1045.5 ( 1.00x)
put_hevc_epel_bi_v8_8_rvv_i32: 180.3 ( 5.80x)
put_hevc_epel_bi_v12_8_c: 2172.7 ( 1.00x)
put_hevc_epel_bi_v12_8_rvv_i32: 264.2 ( 8.22x)
put_hevc_epel_bi_v16_8_c: 3791.6 ( 1.00x)
put_hevc_epel_bi_v16_8_rvv_i32: 336.5 (11.27x)
put_hevc_epel_bi_v24_8_c: 8424.1 ( 1.00x)
put_hevc_epel_bi_v24_8_rvv_i32: 967.2 ( 8.71x)
put_hevc_epel_bi_v32_8_c: 14910.8 ( 1.00x)
put_hevc_epel_bi_v32_8_rvv_i32: 1270.7 (11.73x)
put_hevc_epel_bi_v48_8_c: 33326.5 ( 1.00x)
put_hevc_epel_bi_v48_8_rvv_i32: 2804.7 (11.88x)
put_hevc_epel_bi_v64_8_c: 59177.9 ( 1.00x)
put_hevc_epel_bi_v64_8_rvv_i32: 5022.3 (11.78x)
Signed-off-by: Zhanheng Yang <zhanheng.yang@linux.alibaba.com>
---
libavcodec/riscv/h26x/h2656dsp.h | 11 ++
libavcodec/riscv/h26x/hevcepel_rvv.S | 235 ++++++++++++++++++++++++++-
libavcodec/riscv/hevcdsp_init.c | 4 +
3 files changed, 249 insertions(+), 1 deletion(-)
diff --git a/libavcodec/riscv/h26x/h2656dsp.h b/libavcodec/riscv/h26x/h2656dsp.h
index fa2f5a88e3..085ed4cf14 100644
--- a/libavcodec/riscv/h26x/h2656dsp.h
+++ b/libavcodec/riscv/h26x/h2656dsp.h
@@ -59,4 +59,15 @@ void ff_hevc_put_epel_uni_w_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride,
void ff_hevc_put_epel_bi_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
mx, intptr_t my, int width);
+void ff_hevc_put_epel_v_8_m1_rvv(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_uni_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_uni_w_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_bi_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+ ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width);
#endif
diff --git a/libavcodec/riscv/h26x/hevcepel_rvv.S b/libavcodec/riscv/h26x/hevcepel_rvv.S
index 81044846f7..caca0b88ab 100644
--- a/libavcodec/riscv/h26x/hevcepel_rvv.S
+++ b/libavcodec/riscv/h26x/hevcepel_rvv.S
@@ -262,4 +262,237 @@ func ff_hevc_put_epel_bi_h_8_\lmul\()_rvv, zve32x
endfunc
.endm
-hevc_epel_h m1, m2, m4
\ No newline at end of file
+hevc_epel_h m1, m2, m4
+
+/* 4-tap filter: vdst = s1*vsrc0 + s2*vsrc1 + s3*vsrc2 + s4*vsrc3, widened and unclipped; clobbers v4 and moves vsrc1..3 into vsrc0..2 */
+.macro filter_v vdst, vsrc0, vsrc1, vsrc2, vsrc3
+ vmv.v.x v4, s1 # splat s1 so it can be the signed vector operand of vwmulsu.vv
+ vwmulsu.vv \vdst, v4, \vsrc0 # widening signed(v4) * unsigned(row 0)
+ vwmaccsu.vx \vdst, s2, \vsrc1 # += signed(s2) * unsigned(row 1)
+ vmv.v.v \vsrc0, \vsrc1 # row 0 is consumed: slide row 1 into its register
+ vwmaccsu.vx \vdst, s3, \vsrc2 # += signed(s3) * unsigned(row 2)
+ vmv.v.v \vsrc1, \vsrc2 # slide row 2 down
+ vwmaccsu.vx \vdst, s4, \vsrc3 # += signed(s4) * unsigned(row 3)
+ vmv.v.v \vsrc2, \vsrc3 # slide row 3 down; caller reloads vsrc3 with the next row
+.endm
+
+.macro hevc_epel_v lmul, lmul2, lmul4
+func ff_hevc_put_epel_v_8_\lmul\()_rvv, zve32x /* vertical 4-tap EPEL, 8-bit src to 16-bit dst; a0=dst a1=src a2=srcstride a3=height a5=my a6=width */
+ addi sp, sp, -32 # room to spill s1-s4, which filter_v reads
+ sx s1, 0(sp)
+ sx s2, 8(sp)
+ sx s3, 16(sp)
+ sx s4, 24(sp)
+ load_filter a5 # a5 = my; presumably fills s1-s4 with the taps (macro defined outside this hunk)
+ sub a1, a1, a2 # src - src_stride
+ li t1, 0 # offset
+ mv t4, a3 # keep height so it can be reloaded for every column strip
+
+1:
+ add t2, a1, t1 # t2 = source pointer for this column strip
+ slli t3, t1, 1
+ add t3, a0, t3 # t3 = dst + 2 * offset (16-bit output elements)
+
+ vsetvli t5, a6, e8, \lmul, ta, ma # t5 = number of columns handled in this strip
+ vle8.v v16, (t2) # preload the first three source rows
+ add t2, t2, a2
+ vle8.v v18, (t2)
+ add t2, t2, a2
+ vle8.v v20, (t2)
+ add t2, t2, a2
+
+2:
+ vsetvli zero, zero, e8, \lmul, ta, ma # back to 8-bit elements after the 16-bit store below
+ vle8.v v22, (t2) # newest (fourth) source row
+ add t2, t2, a2
+ filter_v v0, v16, v18, v20, v22 # v0 = widened sum; rows shift down one slot for the next iteration
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vse16.v v0, (t3)
+ addi t3, t3, 2*HEVC_MAX_PB_SIZE # step dst to the next output row
+ addi a3, a3, -1
+ bgt a3, zero, 2b
+ add t1, t1, t5 # advance offset by the columns just processed
+ sub a6, a6, t5 # remaining width
+ mv a3, t4 # reload height
+ bgt a6, zero, 1b
+
+ lx s1, 0(sp)
+ lx s2, 8(sp)
+ lx s3, 16(sp)
+ lx s4, 24(sp)
+ addi sp, sp, 32
+ ret
+endfunc
+
+func ff_hevc_put_epel_uni_v_8_\lmul\()_rvv, zve32x /* vertical 4-tap EPEL to 8-bit dst; a0=dst a1=dststride a2=src a3=srcstride a4=height a6=my a7=width */
+ csrwi vxrm, 0 # fixed-point rounding mode 0 (round-to-nearest-up), used by vnclipu below
+ addi sp, sp, -32 # room to spill s1-s4, which filter_v reads
+ sx s1, 0(sp)
+ sx s2, 8(sp)
+ sx s3, 16(sp)
+ sx s4, 24(sp)
+ load_filter a6 # a6 = my; presumably fills s1-s4 with the taps (macro defined outside this hunk)
+ sub a2, a2, a3 # src - src_stride
+ li t1, 0 # offset
+ mv t4, a4 # keep height so it can be reloaded for every column strip
+
+1:
+ add t2, a2, t1 # t2 = source pointer for this column strip
+ add t3, a0, t1 # t3 = destination pointer for this column strip
+
+ vsetvli t5, a7, e8, \lmul, ta, ma # t5 = number of columns handled in this strip
+ vle8.v v16, (t2) # preload the first three source rows
+ add t2, t2, a3
+ vle8.v v18, (t2)
+ add t2, t2, a3
+ vle8.v v20, (t2)
+ add t2, t2, a3
+
+2:
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vle8.v v22, (t2) # newest (fourth) source row
+ add t2, t2, a3
+ filter_v v0, v16, v18, v20, v22 # v0 = widened sum; rows shift down one slot for the next iteration
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vmax.vx v0, v0, zero # clamp negatives to 0 ahead of the unsigned narrowing clip
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v0, v0, 6 # rounding shift right by 6, saturate to 8 bits
+ vse8.v v0, (t3)
+ add t3, t3, a1 # next destination row
+ addi a4, a4, -1
+ bgt a4, zero, 2b
+ add t1, t1, t5 # advance offset by the columns just processed
+ sub a7, a7, t5 # remaining width
+ mv a4, t4 # reload height
+ bgt a7, zero, 1b
+
+ lx s1, 0(sp)
+ lx s2, 8(sp)
+ lx s3, 16(sp)
+ lx s4, 24(sp)
+ addi sp, sp, 32
+ ret
+endfunc
+
+func ff_hevc_put_epel_uni_w_v_8_\lmul\()_rvv, zve32x /* weighted vertical 4-tap EPEL to 8-bit dst; a0=dst a1=dststride a2=src a3=srcstride a4=height a5=denom a6=wx a7=ox; mx, my, width on stack */
+ csrwi vxrm, 0 # fixed-point rounding mode 0 (round-to-nearest-up), used by vssra below
+#if (__riscv_xlen == 32)
+ lw t1, 4(sp) # my
+ lw t6, 8(sp) # width
+#elif (__riscv_xlen == 64)
+ ld t1, 8(sp) # my
+ lw t6, 16(sp) # width
+#endif
+ addi sp, sp, -32 # stack arguments were read above, before sp moves
+ sx s1, 0(sp)
+ sx s2, 8(sp)
+ sx s3, 16(sp)
+ sx s4, 24(sp)
+ load_filter t1 # t1 = my; presumably fills s1-s4 with the taps (macro defined outside this hunk)
+ addi a5, a5, 6 # shift = denom + 6
+ sub a2, a2, a3 # src - src_stride
+ li t1, 0 # offset (t1 is free again once the filter is loaded)
+ mv t4, a4 # keep height so it can be reloaded for every column strip
+
+1:
+ add t2, a2, t1 # t2 = source pointer for this column strip
+ add t3, a0, t1 # t3 = destination pointer for this column strip
+
+ vsetvli t5, t6, e8, \lmul, ta, ma # t5 = number of columns handled in this strip
+ vle8.v v16, (t2) # preload the first three source rows
+ add t2, t2, a3
+ vle8.v v18, (t2)
+ add t2, t2, a3
+ vle8.v v20, (t2)
+ add t2, t2, a3
+
+2:
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vle8.v v22, (t2) # newest (fourth) source row
+ add t2, t2, a3
+ filter_v v0, v16, v18, v20, v22 # v0 = widened sum; rows shift down one slot for the next iteration
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vwmul.vx v8, v0, a6 # 32-bit product with wx
+ vsetvli zero, zero, e32, \lmul4, ta, ma
+ vssra.vx v0, v8, a5 # rounding arithmetic shift right by denom + 6
+ vsadd.vx v0, v0, a7 # saturating add of ox
+ vmax.vx v0, v0, zero # clamp negatives to 0 ahead of the narrowing clips
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vnclip.wi v0, v0, 0 # narrow 32 to 16 bits with signed saturation
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v0, v0, 0 # narrow 16 to 8 bits with unsigned saturation
+ vse8.v v0, (t3)
+ add t3, t3, a1 # next destination row
+ addi a4, a4, -1
+ bgt a4, zero, 2b
+ add t1, t1, t5 # advance offset by the columns just processed
+ sub t6, t6, t5 # remaining width
+ mv a4, t4 # reload height
+ bgt t6, zero, 1b
+
+ lx s1, 0(sp)
+ lx s2, 8(sp)
+ lx s3, 16(sp)
+ lx s4, 24(sp)
+ addi sp, sp, 32
+ ret
+endfunc
+
+func ff_hevc_put_epel_bi_v_8_\lmul\()_rvv, zve32x /* bi-pred vertical 4-tap EPEL to 8-bit dst; a0=dst a1=dststride a2=src a3=srcstride a4=src2 a5=height a7=my; width on stack */
+ csrwi vxrm, 0 # fixed-point rounding mode 0 (round-to-nearest-up), used by vnclipu below
+ lw t6, 0(sp) # width
+ addi sp, sp, -32 # the stack argument was read above, before sp moves
+ sx s1, 0(sp)
+ sx s2, 8(sp)
+ sx s3, 16(sp)
+ sx s4, 24(sp)
+ load_filter a7 # a7 = my; presumably fills s1-s4 with the taps (macro defined outside this hunk)
+ sub a2, a2, a3 # src - src_stride
+ li t1, 0 # offset
+ mv t4, a5 # keep height so it can be reloaded for every column strip
+
+1:
+ add t2, a2, t1 # t2 = source pointer for this column strip
+ add t3, a0, t1 # t3 = destination pointer for this column strip
+ slli t0, t1, 1
+ add t0, a4, t0 # t0 = src2 + 2 * offset (16-bit elements)
+
+ vsetvli t5, t6, e8, \lmul, ta, ma # t5 = number of columns handled in this strip
+ vle8.v v16, (t2) # preload the first three source rows
+ add t2, t2, a3
+ vle8.v v18, (t2)
+ add t2, t2, a3
+ vle8.v v20, (t2)
+ add t2, t2, a3
+
+2:
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vle8.v v22, (t2) # newest (fourth) source row
+ add t2, t2, a3
+ filter_v v0, v16, v18, v20, v22 # v0 = widened sum; rows shift down one slot for the next iteration
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vle16.v v8, (t0) # matching row of src2
+ addi t0, t0, 2*HEVC_MAX_PB_SIZE # step src2 to its next row
+ vsadd.vv v0, v0, v8 # saturating 16-bit add of the two predictions
+ vmax.vx v0, v0, zero # clamp negatives to 0 ahead of the unsigned narrowing clip
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v0, v0, 7 # rounding shift right by 7, saturate to 8 bits
+ vse8.v v0, (t3)
+ add t3, t3, a1 # next destination row
+ addi a5, a5, -1
+ bgt a5, zero, 2b
+ add t1, t1, t5 # advance offset by the columns just processed
+ sub t6, t6, t5 # remaining width
+ mv a5, t4 # reload height
+ bgt t6, zero, 1b
+
+ lx s1, 0(sp)
+ lx s2, 8(sp)
+ lx s3, 16(sp)
+ lx s4, 24(sp)
+ addi sp, sp, 32
+ ret
+endfunc
+.endm
+
+hevc_epel_v m1, m2, m4 # emit the m1 functions; m2 and m4 are the widened 16-bit and 32-bit groupings
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 8608fdbd19..c7874996a8 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -94,6 +94,10 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 0, 1, ff_hevc_put_epel_uni_h_8_m1_rvv);
RVV_FNASSIGN_PEL(c->put_hevc_epel_uni_w, 0, 1, ff_hevc_put_epel_uni_w_h_8_m1_rvv);
RVV_FNASSIGN_PEL(c->put_hevc_epel_bi, 0, 1, ff_hevc_put_epel_bi_h_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_epel, 1, 0, ff_hevc_put_epel_v_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 1, 0, ff_hevc_put_epel_uni_v_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_epel_uni_w, 1, 0, ff_hevc_put_epel_uni_w_v_8_m1_rvv);
+ RVV_FNASSIGN_PEL(c->put_hevc_epel_bi, 1, 0, ff_hevc_put_epel_bi_v_8_m1_rvv);
break;
default:
break;
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
next prev parent reply other threads:[~2026-01-24 15:57 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-22 4:23 [FFmpeg-devel] [PATCH 1/6] libavcodec/riscv: add RVV optimized for qpel_h " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 2/6] libavcodec/riscv: add RVV optimized for qpel_v " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 3/6] libavcodec/riscv: add RVV optimized for epel_h " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` zhanheng.yang--- via ffmpeg-devel [this message]
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 5/6] libavcodec/riscv: add RVV optimized for qpel_hv " zhanheng.yang--- via ffmpeg-devel
2026-01-22 4:23 ` [FFmpeg-devel] [PATCH 6/6] libavcodec/riscv: add RVV optimized for epel_hv " zhanheng.yang--- via ffmpeg-devel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260122042357.1438-4-zhanheng.yang@linux.alibaba.com \
--to=ffmpeg-devel@ffmpeg.org \
--cc=zhanheng.yang@linux.alibaba.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git