* [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: Optimize dmvr_hv_10 (PR #20517)
@ 2025-09-14 18:20 welder via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: welder via ffmpeg-devel @ 2025-09-14 18:20 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: welder
PR #20517 opened by welder
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20517
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20517.patch
Nothing spectacular, merged a few adds and shifts into rounding shifts.
>From 7809ff9746abf83bc41c1f13d9e1b2f1da6b0fb9 Mon Sep 17 00:00:00 2001
From: Krzysztof Pyrkosz <ffmpeg@szaka.eu>
Date: Fri, 5 Sep 2025 19:52:11 +0200
Subject: [PATCH] avcodec/aarch64/vvc: Optimize dmvr_hv_10
Before and after on A53:
dmvr_hv_10_12x20_neon: 1838.2 ( 3.02x)
dmvr_hv_10_20x12_neon: 1330.2 ( 1.83x)
dmvr_hv_10_20x20_neon: 2148.2 ( 1.85x)
dmvr_hv_12_12x20_neon: 1839.2 ( 3.02x)
dmvr_hv_12_20x12_neon: 1330.6 ( 1.83x)
dmvr_hv_12_20x20_neon: 2147.2 ( 1.85x)
dmvr_hv_10_12x20_neon: 1755.0 ( 3.17x)
dmvr_hv_10_20x12_neon: 1165.8 ( 2.09x)
dmvr_hv_10_20x20_neon: 1876.1 ( 2.12x)
dmvr_hv_12_12x20_neon: 1754.4 ( 3.17x)
dmvr_hv_12_20x12_neon: 1167.8 ( 2.09x)
dmvr_hv_12_20x20_neon: 1878.8 ( 2.12x)
---
libavcodec/aarch64/vvc/inter.S | 58 ++++++++++------------------------
1 file changed, 17 insertions(+), 41 deletions(-)
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 01d2ff155c..79ff720cdd 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -599,18 +599,13 @@ function ff_vvc_dmvr_hv_8_neon, export=1
endfunc
function ff_vvc_dmvr_hv_12_neon, export=1
- movi v29.4s, #(12 - 6)
- movi v30.4s, #(1 << (12 - 7)) // offset1
+ mvni v29.4s, #(12 - 6 - 1)
b 0f
endfunc
function ff_vvc_dmvr_hv_10_neon, export=1
- movi v29.4s, #(10 - 6)
- movi v30.4s, #(1 << (10 - 7)) // offset1
+ mvni v29.4s, #(10 - 6 - 1)
0:
- movi v31.4s, #8 // offset2
- neg v29.4s, v29.4s
-
sub sp, sp, #(VVC_MAX_PB_SIZE * 4)
movrel x9, X(ff_vvc_inter_luma_dmvr_filters)
@@ -626,7 +621,6 @@ function ff_vvc_dmvr_hv_10_neon, export=1
add x12, x9, my, lsl #1
ldrb w10, [x12]
ldrb w11, [x12, #1]
- sxtw x6, w6
dup v2.8h, w10 // filter_y[0]
dup v3.8h, w11 // filter_y[1]
@@ -635,7 +629,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1
mov w10, #0 // start filter_y or not
add height, height, #1
sub dst, dst, #(VVC_MAX_PB_SIZE * 2)
- sub src_stride, src_stride, x6, lsl #1
+ sub src_stride, src_stride, w6, sxtw #1
cset w15, gt // width > 16
1:
mov x12, tmp0
@@ -656,14 +650,10 @@ function ff_vvc_dmvr_hv_10_neon, export=1
umlal v18.4s, v17.4h, v1.4h
umlal2 v19.4s, v17.8h, v1.8h
- add v4.4s, v4.4s, v30.4s
- add v5.4s, v5.4s, v30.4s
- add v18.4s, v18.4s, v30.4s
- add v19.4s, v19.4s, v30.4s
- ushl v4.4s, v4.4s, v29.4s
- ushl v5.4s, v5.4s, v29.4s
- ushl v18.4s, v18.4s, v29.4s
- ushl v19.4s, v19.4s, v29.4s
+ urshl v4.4s, v4.4s, v29.4s
+ urshl v5.4s, v5.4s, v29.4s
+ urshl v18.4s, v18.4s, v29.4s
+ urshl v19.4s, v19.4s, v29.4s
uqxtn v6.4h, v4.4s
uqxtn2 v6.8h, v5.4s
uqxtn v7.4h, v18.4s
@@ -681,18 +671,10 @@ function ff_vvc_dmvr_hv_10_neon, export=1
umlal2 v18.4s, v6.8h, v3.8h
umlal v19.4s, v7.4h, v3.4h
umlal2 v20.4s, v7.8h, v3.8h
- add v17.4s, v17.4s, v31.4s
- add v18.4s, v18.4s, v31.4s
- add v19.4s, v19.4s, v31.4s
- add v20.4s, v20.4s, v31.4s
- ushr v17.4s, v17.4s, #4
- ushr v18.4s, v18.4s, #4
- ushr v19.4s, v19.4s, #4
- ushr v20.4s, v20.4s, #4
- uqxtn v6.4h, v17.4s
- uqxtn2 v6.8h, v18.4s
- uqxtn v7.4h, v19.4s
- uqxtn2 v7.8h, v20.4s
+ uqrshrn v6.4h, v17.4s, #4
+ uqrshrn2 v6.8h, v18.4s, #4
+ uqrshrn v7.4h, v19.4s, #4
+ uqrshrn2 v7.8h, v20.4s, #4
stp q6, q7, [x14], #32
b 3f
2:
@@ -704,10 +686,8 @@ function ff_vvc_dmvr_hv_10_neon, export=1
umlal v4.4s, v7.4h, v1.4h
umlal2 v5.4s, v7.8h, v1.8h
- add v4.4s, v4.4s, v30.4s
- add v5.4s, v5.4s, v30.4s
- ushl v4.4s, v4.4s, v29.4s
- ushl v5.4s, v5.4s, v29.4s
+ urshl v4.4s, v4.4s, v29.4s
+ urshl v5.4s, v5.4s, v29.4s
uqxtn v6.4h, v4.4s
uqxtn2 v6.8h, v5.4s
str q6, [x13], #16
@@ -719,10 +699,8 @@ function ff_vvc_dmvr_hv_10_neon, export=1
umull2 v18.4s, v16.8h, v2.8h
umlal v17.4s, v6.4h, v3.4h
umlal2 v18.4s, v6.8h, v3.8h
- add v17.4s, v17.4s, v31.4s
- add v18.4s, v18.4s, v31.4s
- ushr v17.4s, v17.4s, #4
- ushr v18.4s, v18.4s, #4
+ urshr v17.4s, v17.4s, #4
+ urshr v18.4s, v18.4s, #4
uqxtn v16.4h, v17.4s
uqxtn2 v16.8h, v18.4s
str q16, [x14], #16
@@ -731,8 +709,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1
ldr d6, [src], #8
umull v4.4s, v7.4h, v1.4h
umlal v4.4s, v6.4h, v0.4h
- add v4.4s, v4.4s, v30.4s
- ushl v4.4s, v4.4s, v29.4s
+ urshl v4.4s, v4.4s, v29.4s
uqxtn v6.4h, v4.4s
str d6, [x13], #8
@@ -741,8 +718,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1
ldr d16, [x12], #8
umull v17.4s, v16.4h, v2.4h
umlal v17.4s, v6.4h, v3.4h
- add v17.4s, v17.4s, v31.4s
- ushr v17.4s, v17.4s, #4
+ urshr v17.4s, v17.4s, #4
uqxtn v16.4h, v17.4s
str d16, [x14], #8
4:
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-09-14 18:20 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-09-14 18:20 [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: Optimize dmvr_hv_10 (PR #20517) welder via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git