* [FFmpeg-devel] [PATCH] Optimize vvc_apply_bdof_block_8x (PR #20448)
@ 2025-09-05 17:24 welder via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: welder via ffmpeg-devel @ 2025-09-05 17:24 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: welder
PR #20448 opened by welder
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20448
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20448.patch
The speed improvement is listed in the commit message. The number of arithmetic operations is down from 10 to 6, and some cruft is cleaned up.
>From 8967e3a6a725358494307b51add9349d3e7dd075 Mon Sep 17 00:00:00 2001
From: Krzysztof Pyrkosz <ffmpeg@szaka.eu>
Date: Fri, 5 Sep 2025 19:17:48 +0200
Subject: [PATCH] Optimize vvc_apply_bdof_block_8x
Before and after:
A53:
apply_bdof_8_8x16_neon: 3320.5 ( 4.02x)
apply_bdof_10_8x16_neon: 3317.8 ( 3.90x)
apply_bdof_12_8x16_neon: 3303.6 ( 3.91x)
apply_bdof_8_8x16_neon: 3216.2 ( 4.18x)
apply_bdof_10_8x16_neon: 3181.0 ( 4.09x)
apply_bdof_12_8x16_neon: 3172.1 ( 4.09x)
A72:
apply_bdof_8_8x16_neon: 1827.4 ( 5.02x)
apply_bdof_10_8x16_neon: 1838.5 ( 4.89x)
apply_bdof_12_8x16_neon: 1841.1 ( 4.83x)
apply_bdof_8_8x16_neon: 1691.6 ( 5.46x)
apply_bdof_10_8x16_neon: 1695.9 ( 5.23x)
apply_bdof_12_8x16_neon: 1695.4 ( 5.29x)
A78:
apply_bdof_8_8x16_neon: 648.9 ( 7.43x)
apply_bdof_10_8x16_neon: 646.1 ( 7.04x)
apply_bdof_12_8x16_neon: 643.8 ( 7.04x)
apply_bdof_8_8x16_neon: 603.2 ( 7.97x)
apply_bdof_10_8x16_neon: 604.1 ( 7.52x)
apply_bdof_12_8x16_neon: 604.5 ( 7.52x)
---
libavcodec/aarch64/vvc/inter.S | 39 ++++++++++++----------------------
1 file changed, 13 insertions(+), 26 deletions(-)
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index a6648b64fc..f27b5a47f4 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -802,43 +802,33 @@ endfunc
vy .req x7
ldr w8, [sp]
- movi v7.4s, #(1 << (14 - \bit_depth))
mov x12, #(BDOF_BLOCK_SIZE * 2)
mov x14, #(VVC_MAX_PB_SIZE * 2)
.if \bit_depth >= 10
// clip pixel
mov w15, #((1 << \bit_depth) - 1)
- movi v18.8h, #0
dup v19.8h, w15
.endif
0:
- ld1r {v0.8h}, [vx], #2
- ld1r {v1.8h}, [vy], #2
- ld1r {v2.8h}, [vx]
- ld1r {v3.8h}, [vy]
+ ldr s0, [vx], #(2 * BDOF_MIN_BLOCK_SIZE)
+ ldr s1, [vy], #(2 * BDOF_MIN_BLOCK_SIZE)
mov w13, #(BDOF_MIN_BLOCK_SIZE)
- ins v0.d[1], v2.d[1]
- ins v1.d[1], v3.d[1]
1:
- ld1 {v2.8h}, [gh], x12
- ld1 {v4.8h}, [gv], x12
- smull v3.4s, v0.4h, v2.4h
- smull2 v16.4s, v0.8h, v2.8h
- smlal v3.4s, v1.4h, v4.4h
- smlal2 v16.4s, v1.8h, v4.8h
-
ld1 {v5.8h}, [src0], x14
ld1 {v6.8h}, [src1], x14
- saddl v2.4s, v5.4h, v6.4h
- add v2.4s, v2.4s, v7.4s
- add v2.4s, v2.4s, v3.4s
- saddl2 v4.4s, v5.8h, v6.8h
- add v4.4s, v4.4s, v7.4s
- add v4.4s, v4.4s, v16.4s
+ ld1 {v2.8h}, [gh], x12
+ ld1 {v4.8h}, [gv], x12
- sqshrn v5.4h, v2.4s, #(15 - \bit_depth)
- sqshrn2 v5.8h, v4.4s, #(15 - \bit_depth)
+ saddl v17.4s, v5.4h, v6.4h
+ saddl2 v16.4s, v5.8h, v6.8h
+ smlal v17.4s, v4.4h, v1.h[0]
+ smlal2 v16.4s, v4.8h, v1.h[1]
+ smlal v17.4s, v2.4h, v0.h[0]
+ smlal2 v16.4s, v2.8h, v0.h[1]
+
+ sqrshrun v5.4h, v17.4s, #(15 - \bit_depth)
+ sqrshrun2 v5.8h, v16.4s, #(15 - \bit_depth)
subs w13, w13, #1
.if \bit_depth == 8
sqxtun v5.8b, v5.8h
@@ -846,14 +836,11 @@ endfunc
add dst, dst, dst_stride
.else
smin v5.8h, v5.8h, v19.8h
- smax v5.8h, v5.8h, v18.8h
st1 {v5.8h}, [dst], dst_stride
.endif
b.ne 1b
subs w8, w8, #(BDOF_MIN_BLOCK_SIZE)
- add vx, vx, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
- add vy, vy, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
b.ne 0b
ret
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-09-05 17:25 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-09-05 17:24 [FFmpeg-devel] [PATCH] Optimize vvc_apply_bdof_block_8x (PR #20448) welder via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git