* [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: Unroll vvc_bdof_grad_filter_8x_neon (PR #20519)
@ 2025-09-14 19:56 welder via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: welder via ffmpeg-devel @ 2025-09-14 19:56 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: welder
PR #20519 opened by welder
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20519
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20519.patch
I hope this isn't overkill: I unrolled the 16-wide variant and interleaved the loads, stores and arithmetic ops to the best of my ability. Additionally, I got rid of the internal loop and the mov/add preamble and epilogue.
>From 668ddf2d4a0b9213403f7468ab9d7542a0119afb Mon Sep 17 00:00:00 2001
From: Krzysztof Pyrkosz <ffmpeg@szaka.eu>
Date: Sun, 14 Sep 2025 19:13:24 +0200
Subject: [PATCH] avcodec/aarch64/vvc: Unroll vvc_bdof_grad_filter_8x_neon
Before and after (for each CPU, the first six lines are before this patch and the last six are after):
A53:
apply_bdof_8_16x8_neon: 2733.1 ( 4.88x)
apply_bdof_8_16x16_neon: 5458.6 ( 4.86x)
apply_bdof_10_16x8_neon: 2789.8 ( 4.64x)
apply_bdof_10_16x16_neon: 5523.8 ( 4.68x)
apply_bdof_12_16x8_neon: 2792.8 ( 4.58x)
apply_bdof_12_16x16_neon: 5519.5 ( 4.63x)
apply_bdof_8_16x8_neon: 2571.8 ( 5.12x)
apply_bdof_8_16x16_neon: 5173.3 ( 5.12x)
apply_bdof_10_16x8_neon: 2635.1 ( 4.87x)
apply_bdof_10_16x16_neon: 5243.0 ( 4.89x)
apply_bdof_12_16x8_neon: 2613.0 ( 4.89x)
apply_bdof_12_16x16_neon: 5231.7 ( 4.90x)
A78:
apply_bdof_8_16x8_neon: 565.3 ( 8.43x)
apply_bdof_8_16x16_neon: 1109.5 ( 8.60x)
apply_bdof_10_16x8_neon: 568.2 ( 7.92x)
apply_bdof_10_16x16_neon: 1114.1 ( 8.08x)
apply_bdof_12_16x8_neon: 570.2 ( 7.87x)
apply_bdof_12_16x16_neon: 1116.3 ( 8.03x)
apply_bdof_8_16x8_neon: 541.4 ( 8.81x)
apply_bdof_8_16x16_neon: 1065.9 ( 8.97x)
apply_bdof_10_16x8_neon: 543.2 ( 8.32x)
apply_bdof_10_16x16_neon: 1071.5 ( 8.39x)
apply_bdof_12_16x8_neon: 544.2 ( 8.25x)
apply_bdof_12_16x16_neon: 1074.1 ( 8.37x)
---
libavcodec/aarch64/vvc/inter.S | 85 +++++++++++++++++++++++++++++++---
1 file changed, 78 insertions(+), 7 deletions(-)
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 01d2ff155c..47810ec3c1 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -827,10 +827,10 @@ function vvc_bdof_grad_filter_8x_neon, export=0
src1 .req x5
width .req w6
height .req w7
+ tbnz w6, #4, 16f
1:
mov x10, src0
- mov w11, width
mov x12, gh0
mov x13, gv0
mov x14, src1
@@ -863,16 +863,11 @@ function vvc_bdof_grad_filter_8x_neon, export=0
// results of gradient_v1
sub v6.8h, v6.8h, v7.8h
- add x10, x10, #16
- add x14, x14, #16
-
// (gradient_h0 + gradient_h1) >> 1
shadd v1.8h, v0.8h, v4.8h
// gradient_h0 - gradient_h1
sub v5.8h, v0.8h, v4.8h
- subs w11, w11, #8
-
// (gradient_v0 + gradient_v1) >> 1
shadd v3.8h, v2.8h, v6.8h
// gradient_v0 - gradient_v1
@@ -882,7 +877,6 @@ function vvc_bdof_grad_filter_8x_neon, export=0
st1 {v5.8h}, [x15], #16
st1 {v3.8h}, [x13], #16
st1 {v7.8h}, [x16], #16
- b.ne 2b
subs height, height, #1
add gh0, gh0, #(BDOF_BLOCK_SIZE << 1)
@@ -894,6 +888,83 @@ function vvc_bdof_grad_filter_8x_neon, export=0
b.ne 1b
ret
+16:
+ ldur q0, [x4, #2]
+ ldur q1, [x4, #18]
+ ldur q16, [x4, #-2]
+ sshr v0.8h, v0.8h, #6
+ ldur q17, [x4, #14]
+ sshr v1.8h, v1.8h, #6
+ ldp q18, q19, [x4, #-(VVC_MAX_PB_SIZE << 1)]
+ sshr v16.8h, v16.8h, #6
+ ldp q2, q3, [x4, #(VVC_MAX_PB_SIZE << 1)]!
+ ldur q20, [x5, #2]
+ sshr v17.8h, v17.8h, #6
+ ldur q21, [x5, #18]
+ sshr v2.8h, v2.8h, #6
+ ldur q22, [x5, #-2]
+ sshr v3.8h, v3.8h, #6
+ ldur q23, [x5, #14]
+ sshr v18.8h, v18.8h, #6
+ ldp q26, q27, [x5, #-(VVC_MAX_PB_SIZE << 1)]
+ sshr v19.8h, v19.8h, #6
+ ldp q24, q25, [x5, #(VVC_MAX_PB_SIZE << 1)]!
+
+ // results of gradient_h0
+ sub v0.8h, v0.8h, v16.8h
+ sub v1.8h, v1.8h, v17.8h
+
+ // results of gradient_v0
+ sub v2.8h, v2.8h, v18.8h
+ sub v3.8h, v3.8h, v19.8h
+
+ sshr v20.8h, v20.8h, #6
+ sshr v21.8h, v21.8h, #6
+ sshr v22.8h, v22.8h, #6
+ sshr v23.8h, v23.8h, #6
+
+ // results of gradient_h1
+ sub v20.8h, v20.8h, v22.8h
+ sub v21.8h, v21.8h, v23.8h
+
+ sshr v24.8h, v24.8h, #6
+ sshr v25.8h, v25.8h, #6
+
+ // gradient_h0 - gradient_h1
+ sub v22.8h, v0.8h, v20.8h
+ sub v23.8h, v1.8h, v21.8h
+
+ // (gradient_h0 + gradient_h1) >> 1
+ shadd v16.8h, v0.8h, v20.8h
+ shadd v17.8h, v1.8h, v21.8h
+
+ st1 {v22.8h, v23.8h}, [gh1], #32
+
+ sshr v26.8h, v26.8h, #6
+ sshr v27.8h, v27.8h, #6
+
+ st1 {v16.8h, v17.8h}, [gh0], #32
+
+ // results of gradient_v1
+ sub v24.8h, v24.8h, v26.8h
+ sub v25.8h, v25.8h, v27.8h
+
+ // (gradient_v0 + gradient_v1) >> 1
+ shadd v18.8h, v2.8h, v24.8h
+ shadd v19.8h, v3.8h, v25.8h
+
+ // gradient_v0 - gradient_v1
+ sub v26.8h, v2.8h, v24.8h
+ sub v27.8h, v3.8h, v25.8h
+
+ st1 {v18.8h,v19.8h}, [gv0], #32
+
+ subs height, height, #1
+ st1 {v26.8h,v27.8h}, [gv1], #32
+
+ b.ne 16b
+ ret
+
.unreq gh0
.unreq gh1
.unreq gv0
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-09-14 19:56 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-09-14 19:56 [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: Unroll vvc_bdof_grad_filter_8x_neon (PR #20519) welder via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git