From: welder via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: welder <code@ffmpeg.org> Subject: [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: Optimize dmvr_hv_10 (PR #20517) Date: Sun, 14 Sep 2025 18:20:10 -0000 Message-ID: <175787401067.25.4999638338512204626@463a07221176> (raw) PR #20517 opened by welder URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20517 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20517.patch Nothing spectacular, merged a few adds and shifts into rounding shifts. >From 7809ff9746abf83bc41c1f13d9e1b2f1da6b0fb9 Mon Sep 17 00:00:00 2001 From: Krzysztof Pyrkosz <ffmpeg@szaka.eu> Date: Fri, 5 Sep 2025 19:52:11 +0200 Subject: [PATCH] avcodec/aarch64/vvc: Optimize dmvr_hv_10 Before and after on A53: dmvr_hv_10_12x20_neon: 1838.2 ( 3.02x) dmvr_hv_10_20x12_neon: 1330.2 ( 1.83x) dmvr_hv_10_20x20_neon: 2148.2 ( 1.85x) dmvr_hv_12_12x20_neon: 1839.2 ( 3.02x) dmvr_hv_12_20x12_neon: 1330.6 ( 1.83x) dmvr_hv_12_20x20_neon: 2147.2 ( 1.85x) dmvr_hv_10_12x20_neon: 1755.0 ( 3.17x) dmvr_hv_10_20x12_neon: 1165.8 ( 2.09x) dmvr_hv_10_20x20_neon: 1876.1 ( 2.12x) dmvr_hv_12_12x20_neon: 1754.4 ( 3.17x) dmvr_hv_12_20x12_neon: 1167.8 ( 2.09x) dmvr_hv_12_20x20_neon: 1878.8 ( 2.12x) --- libavcodec/aarch64/vvc/inter.S | 58 ++++++++++------------------------ 1 file changed, 17 insertions(+), 41 deletions(-) diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S index 01d2ff155c..79ff720cdd 100644 --- a/libavcodec/aarch64/vvc/inter.S +++ b/libavcodec/aarch64/vvc/inter.S @@ -599,18 +599,13 @@ function ff_vvc_dmvr_hv_8_neon, export=1 endfunc function ff_vvc_dmvr_hv_12_neon, export=1 - movi v29.4s, #(12 - 6) - movi v30.4s, #(1 << (12 - 7)) // offset1 + mvni v29.4s, #(12 - 6 - 1) b 0f endfunc function ff_vvc_dmvr_hv_10_neon, export=1 - movi v29.4s, #(10 - 6) - movi v30.4s, #(1 << (10 - 7)) // offset1 + mvni v29.4s, #(10 - 6 - 1) 0: - movi v31.4s, #8 // offset2 - neg v29.4s, v29.4s - sub sp, sp, #(VVC_MAX_PB_SIZE * 4) movrel 
x9, X(ff_vvc_inter_luma_dmvr_filters) @@ -626,7 +621,6 @@ function ff_vvc_dmvr_hv_10_neon, export=1 add x12, x9, my, lsl #1 ldrb w10, [x12] ldrb w11, [x12, #1] - sxtw x6, w6 dup v2.8h, w10 // filter_y[0] dup v3.8h, w11 // filter_y[1] @@ -635,7 +629,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1 mov w10, #0 // start filter_y or not add height, height, #1 sub dst, dst, #(VVC_MAX_PB_SIZE * 2) - sub src_stride, src_stride, x6, lsl #1 + sub src_stride, src_stride, w6, sxtw #1 cset w15, gt // width > 16 1: mov x12, tmp0 @@ -656,14 +650,10 @@ function ff_vvc_dmvr_hv_10_neon, export=1 umlal v18.4s, v17.4h, v1.4h umlal2 v19.4s, v17.8h, v1.8h - add v4.4s, v4.4s, v30.4s - add v5.4s, v5.4s, v30.4s - add v18.4s, v18.4s, v30.4s - add v19.4s, v19.4s, v30.4s - ushl v4.4s, v4.4s, v29.4s - ushl v5.4s, v5.4s, v29.4s - ushl v18.4s, v18.4s, v29.4s - ushl v19.4s, v19.4s, v29.4s + urshl v4.4s, v4.4s, v29.4s + urshl v5.4s, v5.4s, v29.4s + urshl v18.4s, v18.4s, v29.4s + urshl v19.4s, v19.4s, v29.4s uqxtn v6.4h, v4.4s uqxtn2 v6.8h, v5.4s uqxtn v7.4h, v18.4s @@ -681,18 +671,10 @@ function ff_vvc_dmvr_hv_10_neon, export=1 umlal2 v18.4s, v6.8h, v3.8h umlal v19.4s, v7.4h, v3.4h umlal2 v20.4s, v7.8h, v3.8h - add v17.4s, v17.4s, v31.4s - add v18.4s, v18.4s, v31.4s - add v19.4s, v19.4s, v31.4s - add v20.4s, v20.4s, v31.4s - ushr v17.4s, v17.4s, #4 - ushr v18.4s, v18.4s, #4 - ushr v19.4s, v19.4s, #4 - ushr v20.4s, v20.4s, #4 - uqxtn v6.4h, v17.4s - uqxtn2 v6.8h, v18.4s - uqxtn v7.4h, v19.4s - uqxtn2 v7.8h, v20.4s + uqrshrn v6.4h, v17.4s, #4 + uqrshrn2 v6.8h, v18.4s, #4 + uqrshrn v7.4h, v19.4s, #4 + uqrshrn2 v7.8h, v20.4s, #4 stp q6, q7, [x14], #32 b 3f 2: @@ -704,10 +686,8 @@ function ff_vvc_dmvr_hv_10_neon, export=1 umlal v4.4s, v7.4h, v1.4h umlal2 v5.4s, v7.8h, v1.8h - add v4.4s, v4.4s, v30.4s - add v5.4s, v5.4s, v30.4s - ushl v4.4s, v4.4s, v29.4s - ushl v5.4s, v5.4s, v29.4s + urshl v4.4s, v4.4s, v29.4s + urshl v5.4s, v5.4s, v29.4s uqxtn v6.4h, v4.4s uqxtn2 v6.8h, v5.4s str q6, [x13], #16 @@ 
-719,10 +699,8 @@ function ff_vvc_dmvr_hv_10_neon, export=1 umull2 v18.4s, v16.8h, v2.8h umlal v17.4s, v6.4h, v3.4h umlal2 v18.4s, v6.8h, v3.8h - add v17.4s, v17.4s, v31.4s - add v18.4s, v18.4s, v31.4s - ushr v17.4s, v17.4s, #4 - ushr v18.4s, v18.4s, #4 + urshr v17.4s, v17.4s, #4 + urshr v18.4s, v18.4s, #4 uqxtn v16.4h, v17.4s uqxtn2 v16.8h, v18.4s str q16, [x14], #16 @@ -731,8 +709,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1 ldr d6, [src], #8 umull v4.4s, v7.4h, v1.4h umlal v4.4s, v6.4h, v0.4h - add v4.4s, v4.4s, v30.4s - ushl v4.4s, v4.4s, v29.4s + urshl v4.4s, v4.4s, v29.4s uqxtn v6.4h, v4.4s str d6, [x13], #8 @@ -741,8 +718,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1 ldr d16, [x12], #8 umull v17.4s, v16.4h, v2.4h umlal v17.4s, v6.4h, v3.4h - add v17.4s, v17.4s, v31.4s - ushr v17.4s, v17.4s, #4 + urshr v17.4s, v17.4s, #4 uqxtn v16.4h, v17.4s str d16, [x14], #8 4: -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-09-14 18:20 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=175787401067.25.4999638338512204626@463a07221176 \ --to=ffmpeg-devel@ffmpeg.org \ --cc=code@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git