From: Krzysztof Pyrkosz via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: Krzysztof Pyrkosz <ffmpeg@szaka.eu> Subject: [FFmpeg-devel] [PATCH 2/2] avcodec/aarch64/vvc: Use rounding shift NEON instruction Date: Wed, 19 Feb 2025 18:40:12 +0100 Message-ID: <20250219174010.3911-4-ffmpeg@szaka.eu> (raw) In-Reply-To: <20250219174010.3911-2-ffmpeg@szaka.eu> --- Before and after on A78. Before: dmvr_8_12x20_neon: 86.2 ( 6.90x) dmvr_8_20x12_neon: 94.8 ( 5.93x) dmvr_8_20x20_neon: 141.5 ( 6.50x) dmvr_12_12x20_neon: 158.0 ( 3.76x) dmvr_12_20x12_neon: 151.2 ( 3.73x) dmvr_12_20x20_neon: 247.2 ( 3.71x) dmvr_hv_8_12x20_neon: 423.2 ( 3.75x) dmvr_hv_8_20x12_neon: 434.0 ( 3.69x) dmvr_hv_8_20x20_neon: 706.0 ( 3.69x) After: dmvr_8_12x20_neon: 77.2 ( 7.70x) dmvr_8_20x12_neon: 66.5 ( 8.49x) dmvr_8_20x20_neon: 92.2 ( 9.90x) dmvr_12_12x20_neon: 80.2 ( 7.38x) dmvr_12_20x12_neon: 58.2 ( 9.59x) dmvr_12_20x20_neon: 90.0 (10.15x) dmvr_hv_8_12x20_neon: 369.0 ( 4.34x) dmvr_hv_8_20x12_neon: 355.8 ( 4.49x) dmvr_hv_8_20x20_neon: 574.2 ( 4.51x) libavcodec/aarch64/vvc/inter.S | 72 ++++++++++------------------------ 1 file changed, 20 insertions(+), 52 deletions(-) diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S index c9d698ee29..45add44b6e 100644 --- a/libavcodec/aarch64/vvc/inter.S +++ b/libavcodec/aarch64/vvc/inter.S @@ -369,22 +369,18 @@ function ff_vvc_dmvr_8_neon, export=1 1: cbz w15, 2f ldr q0, [src], #16 - uxtl v1.8h, v0.8b - uxtl2 v2.8h, v0.16b - ushl v1.8h, v1.8h, v16.8h - ushl v2.8h, v2.8h, v16.8h + ushll v1.8h, v0.8b, #2 + ushll2 v2.8h, v0.16b, #2 stp q1, q2, [dst], #32 b 3f 2: ldr d0, [src], #8 - uxtl v1.8h, v0.8b - ushl v1.8h, v1.8h, v16.8h + ushll v1.8h, v0.8b, #2 str q1, [dst], #16 3: subs height, height, #1 ldr s3, [src], #4 - uxtl v4.8h, v3.8b - ushl v4.4h, v4.4h, v16.4h + ushll v4.8h, v3.8b, #2 st1 {v4.4h}, [dst], x7 add src, src, src_stride @@ -399,42 +395,24 @@ function ff_vvc_dmvr_12_neon, export=1 cmp width, #16 sub src_stride, src_stride, x6, 
lsl #1 cset w15, gt // width > 16 - movi v16.8h, #2 // offset4 sub x7, x7, x6, lsl #1 1: cbz w15, 2f ldp q0, q1, [src], #32 - uaddl v2.4s, v0.4h, v16.4h - uaddl2 v3.4s, v0.8h, v16.8h - uaddl v4.4s, v1.4h, v16.4h - uaddl2 v5.4s, v1.8h, v16.8h - ushr v2.4s, v2.4s, #2 - ushr v3.4s, v3.4s, #2 - ushr v4.4s, v4.4s, #2 - ushr v5.4s, v5.4s, #2 - uqxtn v2.4h, v2.4s - uqxtn2 v2.8h, v3.4s - uqxtn v4.4h, v4.4s - uqxtn2 v4.8h, v5.4s - - stp q2, q4, [dst], #32 + urshr v0.8h, v0.8h, #2 + urshr v1.8h, v1.8h, #2 + + stp q0, q1, [dst], #32 b 3f 2: ldr q0, [src], #16 - uaddl v2.4s, v0.4h, v16.4h - uaddl2 v3.4s, v0.8h, v16.8h - ushr v2.4s, v2.4s, #2 - ushr v3.4s, v3.4s, #2 - uqxtn v2.4h, v2.4s - uqxtn2 v2.8h, v3.4s - str q2, [dst], #16 + urshr v0.8h, v0.8h, #2 + str q0, [dst], #16 3: subs height, height, #1 ldr d0, [src], #8 - uaddl v3.4s, v0.4h, v16.4h - ushr v3.4s, v3.4s, #2 - uqxtn v3.4h, v3.4s - st1 {v3.4h}, [dst], x7 + urshr v0.4h, v0.4h, #2 + st1 {v0.4h}, [dst], x7 add src, src, src_stride b.ne 1b @@ -462,8 +440,6 @@ function ff_vvc_dmvr_hv_8_neon, export=1 ldrb w10, [x12] ldrb w11, [x12, #1] sxtw x6, w6 - movi v30.8h, #(1 << (8 - 7)) // offset1 - movi v31.8h, #8 // offset2 dup v2.8h, w10 // filter_y[0] dup v3.8h, w11 // filter_y[1] @@ -491,10 +467,8 @@ function ff_vvc_dmvr_hv_8_neon, export=1 mul v16.8h, v16.8h, v0.8h mla v6.8h, v7.8h, v1.8h mla v16.8h, v17.8h, v1.8h - add v6.8h, v6.8h, v30.8h - add v16.8h, v16.8h, v30.8h - ushr v6.8h, v6.8h, #(8 - 6) - ushr v7.8h, v16.8h, #(8 - 6) + urshr v6.8h, v6.8h, #(8 - 6) + urshr v7.8h, v16.8h, #(8 - 6) stp q6, q7, [x13], #32 cbz w10, 3f @@ -504,10 +478,8 @@ function ff_vvc_dmvr_hv_8_neon, export=1 mul v17.8h, v17.8h, v2.8h mla v16.8h, v6.8h, v3.8h mla v17.8h, v7.8h, v3.8h - add v16.8h, v16.8h, v31.8h - add v17.8h, v17.8h, v31.8h - ushr v16.8h, v16.8h, #4 - ushr v17.8h, v17.8h, #4 + urshr v16.8h, v16.8h, #4 + urshr v17.8h, v17.8h, #4 stp q16, q17, [x14], #32 b 3f 2: @@ -518,8 +490,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1 uxtl v6.8h, 
v4.8b mul v6.8h, v6.8h, v0.8h mla v6.8h, v7.8h, v1.8h - add v6.8h, v6.8h, v30.8h - ushr v6.8h, v6.8h, #(8 - 6) + urshr v6.8h, v6.8h, #(8 - 6) str q6, [x13], #16 cbz w10, 3f @@ -527,8 +498,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1 ldr q16, [x12], #16 mul v16.8h, v16.8h, v2.8h mla v16.8h, v6.8h, v3.8h - add v16.8h, v16.8h, v31.8h - ushr v16.8h, v16.8h, #4 + urshr v16.8h, v16.8h, #4 str q16, [x14], #16 3: ldur s5, [src, #1] @@ -537,8 +507,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1 uxtl v6.8h, v4.8b mul v6.4h, v6.4h, v0.4h mla v6.4h, v7.4h, v1.4h - add v6.4h, v6.4h, v30.4h - ushr v6.4h, v6.4h, #(8 - 6) + urshr v6.4h, v6.4h, #(8 - 6) str d6, [x13], #8 cbz w10, 4f @@ -546,8 +515,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1 ldr d16, [x12], #8 mul v16.4h, v16.4h, v2.4h mla v16.4h, v6.4h, v3.4h - add v16.4h, v16.4h, v31.4h - ushr v16.4h, v16.4h, #4 + urshr v16.4h, v16.4h, #4 str d16, [x14], #8 4: subs height, height, #1 -- 2.47.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-02-19 18:01 UTC|newest] Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-02-19 17:40 [FFmpeg-devel] [PATCH 1/2] avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12} Krzysztof Pyrkosz via ffmpeg-devel 2025-02-19 17:40 ` Krzysztof Pyrkosz via ffmpeg-devel [this message] 2025-02-20 8:08 ` [FFmpeg-devel] [PATCH 2/2] avcodec/aarch64/vvc: Use rounding shift NEON instruction Zhao Zhili 2025-02-20 7:20 ` [FFmpeg-devel] [PATCH 1/2] avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12} Zhao Zhili 2025-02-20 18:49 ` Krzysztof Pyrkosz via ffmpeg-devel 2025-02-20 18:49 ` [FFmpeg-devel] [PATCH 2/2] avcodec/aarch64/vvc: Use rounding shift NEON instruction Krzysztof Pyrkosz via ffmpeg-devel
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20250219174010.3911-4-ffmpeg@szaka.eu \ --to=ffmpeg-devel@ffmpeg.org \ --cc=ffmpeg@szaka.eu \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git