* [FFmpeg-devel] [PR] qpel_h16_v1 (PR #21761)
@ 2026-02-15 11:32 Jun Zhao via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: Jun Zhao via ffmpeg-devel @ 2026-02-15 11:32 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Jun Zhao
PR #21761 opened by Jun Zhao (mypopydev)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21761
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21761.patch
These two commits clean up and optimize the HEVC/VVC qpel horizontal filter NEON assembly. The first removes a duplicate
mov mx, x30 instruction in the VVC qpel h16/h32 functions where the link register save was needlessly performed twice.
The second rewrites the HEVC qpel H-pass for width >= 16 to use byte-domain widening multiply (calc_qpelb/calc_qpelb2)
instead of the previous int16-domain approach, eliminating uxtl expansion and bl/ret call overhead, yielding a ~1.39x
speedup on the H-pass itself and ~1.16x geometric mean improvement across all HV-path variants on Apple M4; VVC qpel
h16/h32 are separated into self-contained int16-domain functions since VVC filters are incompatible with the hardcoded
sign pattern used in the byte-domain macros.
>From 70b6e89c5a604612498e39e700dea0dafdb05bf1 Mon Sep 17 00:00:00 2001
From: Jun Zhao <barryjzhao@tencent.com>
Date: Sun, 15 Feb 2026 19:27:10 +0800
Subject: [PATCH 1/2] lavc/vvc: remove duplicate 'mov mx, x30' in VVC qpel
h16/h32
The VVC qpel h16 and h32 functions contained two identical
'mov mx, x30' instructions, one on each side of 'sub src, src, #3'.
Both come after vvc_load_filter has been handed mx (the filter pointer
argument) and both store the same link-register value, so the first
one is a dead store that the second simply repeats.
Remove the first instance. The remaining 'mov mx, x30' stays after
'sub src, src, #3', i.e. mx is only overwritten with the link
register once vvc_load_filter is done with the filter pointer.
Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
---
libavcodec/aarch64/h26x/qpel_neon.S | 2 --
1 file changed, 2 deletions(-)
diff --git a/libavcodec/aarch64/h26x/qpel_neon.S b/libavcodec/aarch64/h26x/qpel_neon.S
index 7901fedaf3..b7d2e0f34a 100644
--- a/libavcodec/aarch64/h26x/qpel_neon.S
+++ b/libavcodec/aarch64/h26x/qpel_neon.S
@@ -556,7 +556,6 @@ endfunc
function ff_vvc_put_\type\()_h16_8_neon, export=1
vvc_load_filter mx
sxtw height, heightw
- mov mx, x30
sub src, src, #3
mov mx, x30
.ifc \type, qpel
@@ -634,7 +633,6 @@ endfunc
function ff_vvc_put_\type\()_h32_8_neon, export=1
vvc_load_filter mx
sxtw height, heightw
- mov mx, x30
sub src, src, #3
mov mx, x30
.ifc \type, qpel
--
2.52.0
>From 912c260a0d37fc205d27eeae2e448eb4bd73106f Mon Sep 17 00:00:00 2001
From: Jun Zhao <barryjzhao@tencent.com>
Date: Sun, 15 Feb 2026 13:23:24 +0800
Subject: [PATCH 2/2] lavc/hevc: optimize qpel H-pass for width>=16 with
byte-domain widening multiply
Rewrite ff_hevc_put_hevc_qpel_h16_8_neon and h32 to use byte-domain
widening multiply (umull/umlal/umlsl via calc_qpelb/calc_qpelb2 macros)
instead of the previous int16-domain approach (uxtl + mul/mla).
The byte-domain approach eliminates the uxtl expansion step and halves
the ext stride (1 byte vs 2 bytes per tap), reducing per-row instruction
count from ~32 to ~23. The functions are also inlined, removing bl/ret
call overhead.
This benefits all HV-path callers (hv/uni_hv/bi_hv/uni_w_hv/bi_w_hv)
at widths 16/32/48/64.
checkasm benchmarks on Apple M4 (5-run average):
H-pass standalone (NEON):
h16: 34.0 -> 24.4 cycles (1.39x speedup)
h32: 132.0 -> 95.0 cycles (1.39x speedup)
h64: 521.8 -> 373.9 cycles (1.40x speedup)
HV compound paths geometric mean speedup (NEON, width >= 16):
qpel_hv: 1.144x (4 functions)
qpel_bi_hv: 1.158x (4 functions)
qpel_uni_hv: 1.188x (4 functions)
qpel_uni_w_hv: 1.158x (3 functions)
Overall: 1.162x (15 functions)
VVC qpel h16/h32 are separated into self-contained functions retaining
the int16-domain approach, as VVC filters have arbitrary coefficients
incompatible with the hardcoded sign pattern in calc_qpelb.
Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
---
libavcodec/aarch64/h26x/qpel_neon.S | 165 ++++++++++++++++++++++------
1 file changed, 129 insertions(+), 36 deletions(-)
diff --git a/libavcodec/aarch64/h26x/qpel_neon.S b/libavcodec/aarch64/h26x/qpel_neon.S
index b7d2e0f34a..423db38491 100644
--- a/libavcodec/aarch64/h26x/qpel_neon.S
+++ b/libavcodec/aarch64/h26x/qpel_neon.S
@@ -552,20 +552,64 @@ function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
ret mx
endfunc
+.ifc \type, qpel
+// VVC qpel h16: int16-domain path, two rows per iteration; the filtering itself is still done by the shared ff_hevc_put_hevc_h16_8_neon helper called below
+function ff_vvc_put_qpel_h16_8_neon, export=1
+ vvc_load_filter mx
+ sxtw height, heightw
+ sub src, src, #3 // back src up by 3 bytes (presumably to cover the left filter taps)
+ mov mx, x30 // vvc_load_filter has already been given mx; reuse it to hold the return address, since bl below overwrites x30
+ mov dststride, #(VVC_MAX_PB_SIZE << 1)
+ lsl x13, srcstride, #1 // srcstridel: 2 * srcstride, the source step per iteration
+ mov x14, #(VVC_MAX_PB_SIZE << 2) // 2 * dststride, the destination step per iteration
+ add x10, dst, dststride // dstb: second output row
+ add x12, src, srcstride // srcb: second input row
+1: ld1 {v16.8b-v18.8b}, [src], x13 // 24 bytes of the first row, then skip two rows ahead
+ ld1 {v19.8b-v21.8b}, [x12], x13 // 24 bytes of the second row
+ uxtl v16.8h, v16.8b
+ uxtl v19.8h, v19.8b
+ bl ff_hevc_put_hevc_h16_8_neon // helper body is not shown here; its results are stored from v26-v29 below
+ subs height, height, #2
+ st1 {v26.8h, v27.8h}, [dst], x14 // first row: 16 halfword results
+ st1 {v28.8h, v29.8h}, [x10], x14 // second row: 16 halfword results
+ b.gt 1b // double line
+ ret mx
+endfunc
+
+// HEVC qpel h16: byte-domain widening multiply, one row per iteration, no helper call
+function ff_hevc_put_hevc_qpel_h16_8_neon, export=1
+ load_qpel_filterb mx, x15
+ sxtw height, heightw
+ sub src, src, #3 // back src up by 3 bytes (presumably to cover the left filter taps)
+ mov dststride, #(HEVC_MAX_PB_SIZE << 1)
+1:
+ ld1 {v16.16b, v17.16b}, [src], srcstride // loads 32 bytes; NOTE(review): the ext windows below reach byte 22 at most, and the int16 path loaded 24 - confirm the extra bytes cannot run past the source padding
+ ext v18.16b, v16.16b, v17.16b, #1 // v18..v24: the same row shifted by 1..7 bytes
+ ext v19.16b, v16.16b, v17.16b, #2
+ ext v20.16b, v16.16b, v17.16b, #3
+ ext v21.16b, v16.16b, v17.16b, #4
+ ext v22.16b, v16.16b, v17.16b, #5
+ ext v23.16b, v16.16b, v17.16b, #6
+ ext v24.16b, v16.16b, v17.16b, #7
+ calc_qpelb v26, v16, v18, v19, v20, v21, v22, v23, v24 // macro not shown here; presumably filters the low 8 columns into v26
+ calc_qpelb2 v27, v16, v18, v19, v20, v21, v22, v23, v24 // presumably the high 8 columns into v27 - TODO confirm against the macro
+ stp q26, q27, [dst] // 32 output bytes for this row
+ add dst, dst, dststride
+ subs height, height, #1
+ b.gt 1b
+ ret
+endfunc
+
+.else // qpel_uni, qpel_bi
+
.ifnc \type, qpel_bi
function ff_vvc_put_\type\()_h16_8_neon, export=1
vvc_load_filter mx
sxtw height, heightw
sub src, src, #3
mov mx, x30
-.ifc \type, qpel
- mov dststride, #(VVC_MAX_PB_SIZE << 1)
- lsl x13, srcstride, #1 // srcstridel
- mov x14, #(VVC_MAX_PB_SIZE << 2)
-.else
lsl x14, dststride, #1 // dststridel
lsl x13, srcstride, #1 // srcstridel
-.endif
b 0f
endfunc
.endif // !qpel_bi
@@ -581,14 +625,8 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
.endif
sub src, src, #3
mov mx, x30
-.ifc \type, qpel
- mov dststride, #(HEVC_MAX_PB_SIZE << 1)
- lsl x13, srcstride, #1 // srcstridel
- mov x14, #(HEVC_MAX_PB_SIZE << 2)
-.else
lsl x14, dststride, #1 // dststridel
lsl x13, srcstride, #1 // srcstridel
-.endif
0:
add x10, dst, dststride // dstb
add x12, src, srcstride // srcb
@@ -601,10 +639,6 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
bl ff_hevc_put_hevc_h16_8_neon
subs height, height, #2
-.ifc \type, qpel
- st1 {v26.8h, v27.8h}, [dst], x14
- st1 {v28.8h, v29.8h}, [x10], x14
-.else
.ifc \type, qpel_bi
ld1 {v16.8h, v17.8h}, [ x4], x16
ld1 {v18.8h, v19.8h}, [x15], x16
@@ -624,27 +658,96 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
.endif
st1 {v26.8b, v27.8b}, [dst], x14
st1 {v28.8b, v29.8b}, [x10], x14
-.endif
b.gt 1b // double line
ret mx
endfunc
+.endif // qpel vs qpel_uni/qpel_bi
+
+.ifc \type, qpel
+// VVC qpel h32: int16-domain path, two rows per outer iteration, 16 columns per inner iteration; filtering is done by the shared ff_hevc_put_hevc_h16_8_neon helper
+function ff_vvc_put_qpel_h32_8_neon, export=1
+ vvc_load_filter mx
+ sxtw height, heightw
+ sub src, src, #3 // back src up by 3 bytes (presumably to cover the left filter taps)
+ mov mx, x30 // same order as the h16 variant; mx holds the return address because bl below overwrites x30
+ mov dststride, #(VVC_MAX_PB_SIZE << 1)
+ lsl x13, srcstride, #1 // srcstridel: 2 * srcstride
+ mov x14, #(VVC_MAX_PB_SIZE << 2) // 2 * dststride
+ sub x14, x14, width, uxtw #1 // minus the 2 * width bytes the inner loop stores per row
+ sub x13, x13, width, uxtw // minus the bytes the inner loop reads per row (assumes width is a multiple of 16) ...
+ sub x13, x13, #8 // ... and the 8 bytes loaded ahead of the inner loop
+ add x10, dst, dststride // dstb: second output row
+ add x12, src, srcstride // srcb: second input row
+0: mov w9, width // columns left in this row pair
+ ld1 {v16.8b}, [src], #8 // first 8 bytes of each row are loaded and widened up front
+ ld1 {v19.8b}, [x12], #8
+ uxtl v16.8h, v16.8b
+ uxtl v19.8h, v19.8b
+1:
+ ld1 {v17.8b-v18.8b}, [src], #16 // next 16 bytes of the first row
+ ld1 {v20.8b-v21.8b}, [x12], #16 // next 16 bytes of the second row
+ bl ff_hevc_put_hevc_h16_8_neon // helper body is not shown here; its results are stored from v26-v29 below
+ subs w9, w9, #16
+ mov v16.16b, v18.16b // carry the tail of this chunk over as the head of the next one
+ mov v19.16b, v21.16b
+ st1 {v26.8h, v27.8h}, [dst], #32 // first row: 16 halfword results
+ st1 {v28.8h, v29.8h}, [x10], #32 // second row: 16 halfword results
+ b.gt 1b // next 16 columns of the same two rows
+ subs height, height, #2
+ add src, src, x13 // step all four row pointers two rows down
+ add x12, x12, x13
+ add dst, dst, x14
+ add x10, x10, x14
+ b.gt 0b
+ ret mx
+endfunc
+
+// HEVC qpel h32: byte-domain widening multiply; one row at a time, 16 columns per inner iteration, no helper call
+function ff_hevc_put_hevc_qpel_h32_8_neon, export=1
+ load_qpel_filterb mx, x15
+ sxtw height, heightw
+ sub src, src, #3 // back src up by 3 bytes (presumably to cover the left filter taps)
+ mov dststride, #(HEVC_MAX_PB_SIZE << 1)
+ // src/dst stay at the start of the current row; x10/x11 are the in-row cursors
+0:
+ mov w9, width // columns left in this row
+ mov x10, src
+ mov x11, dst
+1:
+ ld1 {v16.16b, v17.16b}, [x10] // loads 32 bytes; NOTE(review): the ext windows below reach byte 22 at most - confirm the last chunk cannot run past the source padding
+ add x10, x10, #16 // the next chunk starts 16 source bytes further on
+ ext v18.16b, v16.16b, v17.16b, #1 // v18..v24: the same chunk shifted by 1..7 bytes
+ ext v19.16b, v16.16b, v17.16b, #2
+ ext v20.16b, v16.16b, v17.16b, #3
+ ext v21.16b, v16.16b, v17.16b, #4
+ ext v22.16b, v16.16b, v17.16b, #5
+ ext v23.16b, v16.16b, v17.16b, #6
+ ext v24.16b, v16.16b, v17.16b, #7
+ calc_qpelb v26, v16, v18, v19, v20, v21, v22, v23, v24 // macro not shown here; presumably filters the low 8 columns into v26
+ calc_qpelb2 v27, v16, v18, v19, v20, v21, v22, v23, v24 // presumably the high 8 columns into v27 - TODO confirm against the macro
+ stp q26, q27, [x11], #32 // 32 output bytes per 16 columns
+ subs w9, w9, #16
+ b.gt 1b
+ // row done: step both row pointers to the next row
+ add src, src, srcstride
+ add dst, dst, dststride // the inner loop never moved dst, so one full stride reaches the next row
+ subs height, height, #1
+ b.gt 0b
+ ret
+endfunc
+
+.else // qpel_uni, qpel_bi
+
.ifnc \type, qpel_bi
function ff_vvc_put_\type\()_h32_8_neon, export=1
vvc_load_filter mx
sxtw height, heightw
sub src, src, #3
mov mx, x30
-.ifc \type, qpel
- mov dststride, #(VVC_MAX_PB_SIZE << 1)
- lsl x13, srcstride, #1 // srcstridel
- mov x14, #(VVC_MAX_PB_SIZE << 2)
- sub x14, x14, width, uxtw #1
-.else
lsl x14, dststride, #1 // dststridel
lsl x13, srcstride, #1 // srcstridel
sub x14, x14, width, uxtw
-.endif
b 1f
endfunc
.endif // !qpel_bi
@@ -662,16 +765,9 @@ function ff_hevc_put_hevc_\type\()_h32_8_neon, export=1
.endif
sub src, src, #3
mov mx, x30
-.ifc \type, qpel
- mov dststride, #(HEVC_MAX_PB_SIZE << 1)
- lsl x13, srcstride, #1 // srcstridel
- mov x14, #(HEVC_MAX_PB_SIZE << 2)
- sub x14, x14, width, uxtw #1
-.else
lsl x14, dststride, #1 // dststridel
lsl x13, srcstride, #1 // srcstridel
sub x14, x14, width, uxtw
-.endif
1:
sub x13, x13, width, uxtw
sub x13, x13, #8
@@ -691,10 +787,6 @@ function ff_hevc_put_hevc_\type\()_h32_8_neon, export=1
mov v16.16b, v18.16b
mov v19.16b, v21.16b
-.ifc \type, qpel
- st1 {v26.8h, v27.8h}, [dst], #32
- st1 {v28.8h, v29.8h}, [x10], #32
-.else
.ifc \type, qpel_bi
ld1 {v20.8h, v21.8h}, [ x4], #32
ld1 {v22.8h, v23.8h}, [x15], #32
@@ -714,7 +806,6 @@ function ff_hevc_put_hevc_\type\()_h32_8_neon, export=1
.endif
st1 {v26.8b, v27.8b}, [dst], #16
st1 {v28.8b, v29.8b}, [x10], #16
-.endif
b.gt 1b // double line
subs height, height, #2
add src, src, x13
@@ -729,6 +820,8 @@ function ff_hevc_put_hevc_\type\()_h32_8_neon, export=1
ret mx
endfunc
+.endif // qpel vs qpel_uni/qpel_bi
+
.unreq height
.unreq heightw
.unreq width
--
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2026-02-15 11:32 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-15 11:32 [FFmpeg-devel] [PR] qpel_h16_v1 (PR #21761) Jun Zhao via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git