Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: welder via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: welder <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH] Optimize vvc_apply_bdof_block_8x (PR #20448)
Date: Fri, 05 Sep 2025 17:24:58 -0000
Message-ID: <175709309882.25.14586465106777414812@463a07221176> (raw)

PR #20448 opened by welder
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20448
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20448.patch

The speed improvement is shown in the commit message. The number of arithmetic operations is down from 10 to 6, and some cruft is cleaned up.


>From 8967e3a6a725358494307b51add9349d3e7dd075 Mon Sep 17 00:00:00 2001
From: Krzysztof Pyrkosz <ffmpeg@szaka.eu>
Date: Fri, 5 Sep 2025 19:17:48 +0200
Subject: [PATCH] Optimize vvc_apply_bdof_block_8x

Before and after (for each CPU, the first group is before and the second is after):
A53:
apply_bdof_8_8x16_neon:                               3320.5 ( 4.02x)
apply_bdof_10_8x16_neon:                              3317.8 ( 3.90x)
apply_bdof_12_8x16_neon:                              3303.6 ( 3.91x)

apply_bdof_8_8x16_neon:                               3216.2 ( 4.18x)
apply_bdof_10_8x16_neon:                              3181.0 ( 4.09x)
apply_bdof_12_8x16_neon:                              3172.1 ( 4.09x)

A72:
apply_bdof_8_8x16_neon:                               1827.4 ( 5.02x)
apply_bdof_10_8x16_neon:                              1838.5 ( 4.89x)
apply_bdof_12_8x16_neon:                              1841.1 ( 4.83x)

apply_bdof_8_8x16_neon:                               1691.6 ( 5.46x)
apply_bdof_10_8x16_neon:                              1695.9 ( 5.23x)
apply_bdof_12_8x16_neon:                              1695.4 ( 5.29x)

A78:
apply_bdof_8_8x16_neon:                                648.9 ( 7.43x)
apply_bdof_10_8x16_neon:                               646.1 ( 7.04x)
apply_bdof_12_8x16_neon:                               643.8 ( 7.04x)

apply_bdof_8_8x16_neon:                                603.2 ( 7.97x)
apply_bdof_10_8x16_neon:                               604.1 ( 7.52x)
apply_bdof_12_8x16_neon:                               604.5 ( 7.52x)
---
 libavcodec/aarch64/vvc/inter.S | 39 ++++++++++++----------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index a6648b64fc..f27b5a47f4 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -802,43 +802,33 @@ endfunc
         vy              .req x7
 
         ldr             w8, [sp]
-        movi            v7.4s, #(1 << (14 - \bit_depth))
         mov             x12, #(BDOF_BLOCK_SIZE * 2)
         mov             x14, #(VVC_MAX_PB_SIZE * 2)
 .if \bit_depth >= 10
         // clip pixel
         mov             w15, #((1 << \bit_depth) - 1)
-        movi            v18.8h, #0
         dup             v19.8h, w15
 .endif
 
 0:
-        ld1r            {v0.8h}, [vx], #2
-        ld1r            {v1.8h}, [vy], #2
-        ld1r            {v2.8h}, [vx]
-        ld1r            {v3.8h}, [vy]
+        ldr             s0, [vx], #(2 * BDOF_MIN_BLOCK_SIZE)
+        ldr             s1, [vy], #(2 * BDOF_MIN_BLOCK_SIZE)
         mov             w13, #(BDOF_MIN_BLOCK_SIZE)
-        ins             v0.d[1], v2.d[1]
-        ins             v1.d[1], v3.d[1]
 1:
-        ld1             {v2.8h}, [gh], x12
-        ld1             {v4.8h}, [gv], x12
-        smull           v3.4s, v0.4h, v2.4h
-        smull2          v16.4s, v0.8h, v2.8h
-        smlal           v3.4s, v1.4h, v4.4h
-        smlal2          v16.4s, v1.8h, v4.8h
-
         ld1             {v5.8h}, [src0], x14
         ld1             {v6.8h}, [src1], x14
-        saddl           v2.4s, v5.4h, v6.4h
-        add             v2.4s, v2.4s, v7.4s
-        add             v2.4s, v2.4s, v3.4s
-        saddl2          v4.4s, v5.8h, v6.8h
-        add             v4.4s, v4.4s, v7.4s
-        add             v4.4s, v4.4s, v16.4s
+        ld1             {v2.8h}, [gh], x12
+        ld1             {v4.8h}, [gv], x12
 
-        sqshrn          v5.4h, v2.4s, #(15 - \bit_depth)
-        sqshrn2         v5.8h, v4.4s, #(15 - \bit_depth)
+        saddl           v17.4s, v5.4h, v6.4h
+        saddl2          v16.4s, v5.8h, v6.8h
+        smlal           v17.4s, v4.4h, v1.h[0]
+        smlal2          v16.4s, v4.8h, v1.h[1]
+        smlal           v17.4s, v2.4h, v0.h[0]
+        smlal2          v16.4s, v2.8h, v0.h[1]
+
+        sqrshrun        v5.4h, v17.4s, #(15 - \bit_depth)
+        sqrshrun2       v5.8h, v16.4s, #(15 - \bit_depth)
         subs            w13, w13, #1
 .if \bit_depth == 8
         sqxtun          v5.8b, v5.8h
@@ -846,14 +836,11 @@ endfunc
         add             dst, dst, dst_stride
 .else
         smin            v5.8h, v5.8h, v19.8h
-        smax            v5.8h, v5.8h, v18.8h
         st1             {v5.8h}, [dst], dst_stride
 .endif
         b.ne            1b
 
         subs            w8, w8, #(BDOF_MIN_BLOCK_SIZE)
-        add             vx, vx, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
-        add             vy, vy, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
         b.ne            0b
         ret
 
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

                 reply	other threads:[~2025-09-05 17:25 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=175709309882.25.14586465106777414812@463a07221176 \
    --to=ffmpeg-devel@ffmpeg.org \
    --cc=code@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git