From: Zhao Zhili <quinkblack-at-foxmail.com@ffmpeg.org> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: Krzysztof Pyrkosz <ffmpeg@szaka.eu> Subject: Re: [FFmpeg-devel] [PATCH 2/2] avcodec/aarch64/vvc: Use rounding shift NEON instruction Date: Thu, 20 Feb 2025 16:08:04 +0800 Message-ID: <tencent_59919664CD30A044CD87CFB85503A72A3807@qq.com> (raw) In-Reply-To: <20250219174010.3911-4-ffmpeg@szaka.eu> > On Feb 20, 2025, at 01:40, Krzysztof Pyrkosz via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> wrote: > > --- > > Before and after on A78 > > dmvr_8_12x20_neon: 86.2 ( 6.90x) > dmvr_8_20x12_neon: 94.8 ( 5.93x) > dmvr_8_20x20_neon: 141.5 ( 6.50x) > dmvr_12_12x20_neon: 158.0 ( 3.76x) > dmvr_12_20x12_neon: 151.2 ( 3.73x) > dmvr_12_20x20_neon: 247.2 ( 3.71x) > dmvr_hv_8_12x20_neon: 423.2 ( 3.75x) > dmvr_hv_8_20x12_neon: 434.0 ( 3.69x) > dmvr_hv_8_20x20_neon: 706.0 ( 3.69x) > > dmvr_8_12x20_neon: 77.2 ( 7.70x) > dmvr_8_20x12_neon: 66.5 ( 8.49x) > dmvr_8_20x20_neon: 92.2 ( 9.90x) > dmvr_12_12x20_neon: 80.2 ( 7.38x) > dmvr_12_20x12_neon: 58.2 ( 9.59x) > dmvr_12_20x20_neon: 90.0 (10.15x) > dmvr_hv_8_12x20_neon: 369.0 ( 4.34x) > dmvr_hv_8_20x12_neon: 355.8 ( 4.49x) > dmvr_hv_8_20x20_neon: 574.2 ( 4.51x) > > libavcodec/aarch64/vvc/inter.S | 72 ++++++++++------------------------ > 1 file changed, 20 insertions(+), 52 deletions(-) > > diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S > index c9d698ee29..45add44b6e 100644 > --- a/libavcodec/aarch64/vvc/inter.S > +++ b/libavcodec/aarch64/vvc/inter.S > @@ -369,22 +369,18 @@ function ff_vvc_dmvr_8_neon, export=1 > 1: > cbz w15, 2f > ldr q0, [src], #16 > - uxtl v1.8h, v0.8b > - uxtl2 v2.8h, v0.16b > - ushl v1.8h, v1.8h, v16.8h > - ushl v2.8h, v2.8h, v16.8h Please also remove the assignment to v16 in this function (ff_vvc_dmvr_8_neon), since the ushl instructions that read it are being removed. LGTM otherwise. 
> + ushll v1.8h, v0.8b, #2 > + ushll2 v2.8h, v0.16b, #2 > stp q1, q2, [dst], #32 > b 3f > 2: > ldr d0, [src], #8 > - uxtl v1.8h, v0.8b > - ushl v1.8h, v1.8h, v16.8h > + ushll v1.8h, v0.8b, #2 > str q1, [dst], #16 > 3: > subs height, height, #1 > ldr s3, [src], #4 > - uxtl v4.8h, v3.8b > - ushl v4.4h, v4.4h, v16.4h > + ushll v4.8h, v3.8b, #2 > st1 {v4.4h}, [dst], x7 > > add src, src, src_stride > @@ -399,42 +395,24 @@ function ff_vvc_dmvr_12_neon, export=1 > cmp width, #16 > sub src_stride, src_stride, x6, lsl #1 > cset w15, gt // width > 16 > - movi v16.8h, #2 // offset4 > sub x7, x7, x6, lsl #1 > 1: > cbz w15, 2f > ldp q0, q1, [src], #32 > - uaddl v2.4s, v0.4h, v16.4h > - uaddl2 v3.4s, v0.8h, v16.8h > - uaddl v4.4s, v1.4h, v16.4h > - uaddl2 v5.4s, v1.8h, v16.8h > - ushr v2.4s, v2.4s, #2 > - ushr v3.4s, v3.4s, #2 > - ushr v4.4s, v4.4s, #2 > - ushr v5.4s, v5.4s, #2 > - uqxtn v2.4h, v2.4s > - uqxtn2 v2.8h, v3.4s > - uqxtn v4.4h, v4.4s > - uqxtn2 v4.8h, v5.4s > - > - stp q2, q4, [dst], #32 > + urshr v0.8h, v0.8h, #2 > + urshr v1.8h, v1.8h, #2 > + > + stp q0, q1, [dst], #32 > b 3f > 2: > ldr q0, [src], #16 > - uaddl v2.4s, v0.4h, v16.4h > - uaddl2 v3.4s, v0.8h, v16.8h > - ushr v2.4s, v2.4s, #2 > - ushr v3.4s, v3.4s, #2 > - uqxtn v2.4h, v2.4s > - uqxtn2 v2.8h, v3.4s > - str q2, [dst], #16 > + urshr v0.8h, v0.8h, #2 > + str q0, [dst], #16 > 3: > subs height, height, #1 > ldr d0, [src], #8 > - uaddl v3.4s, v0.4h, v16.4h > - ushr v3.4s, v3.4s, #2 > - uqxtn v3.4h, v3.4s > - st1 {v3.4h}, [dst], x7 > + urshr v0.4h, v0.4h, #2 > + st1 {v0.4h}, [dst], x7 > > add src, src, src_stride > b.ne 1b > @@ -462,8 +440,6 @@ function ff_vvc_dmvr_hv_8_neon, export=1 > ldrb w10, [x12] > ldrb w11, [x12, #1] > sxtw x6, w6 > - movi v30.8h, #(1 << (8 - 7)) // offset1 > - movi v31.8h, #8 // offset2 > dup v2.8h, w10 // filter_y[0] > dup v3.8h, w11 // filter_y[1] > > @@ -491,10 +467,8 @@ function ff_vvc_dmvr_hv_8_neon, export=1 > mul v16.8h, v16.8h, v0.8h > mla v6.8h, v7.8h, v1.8h > mla v16.8h, 
v17.8h, v1.8h > - add v6.8h, v6.8h, v30.8h > - add v16.8h, v16.8h, v30.8h > - ushr v6.8h, v6.8h, #(8 - 6) > - ushr v7.8h, v16.8h, #(8 - 6) > + urshr v6.8h, v6.8h, #(8 - 6) > + urshr v7.8h, v16.8h, #(8 - 6) > stp q6, q7, [x13], #32 > > cbz w10, 3f > @@ -504,10 +478,8 @@ function ff_vvc_dmvr_hv_8_neon, export=1 > mul v17.8h, v17.8h, v2.8h > mla v16.8h, v6.8h, v3.8h > mla v17.8h, v7.8h, v3.8h > - add v16.8h, v16.8h, v31.8h > - add v17.8h, v17.8h, v31.8h > - ushr v16.8h, v16.8h, #4 > - ushr v17.8h, v17.8h, #4 > + urshr v16.8h, v16.8h, #4 > + urshr v17.8h, v17.8h, #4 > stp q16, q17, [x14], #32 > b 3f > 2: > @@ -518,8 +490,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1 > uxtl v6.8h, v4.8b > mul v6.8h, v6.8h, v0.8h > mla v6.8h, v7.8h, v1.8h > - add v6.8h, v6.8h, v30.8h > - ushr v6.8h, v6.8h, #(8 - 6) > + urshr v6.8h, v6.8h, #(8 - 6) > str q6, [x13], #16 > > cbz w10, 3f > @@ -527,8 +498,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1 > ldr q16, [x12], #16 > mul v16.8h, v16.8h, v2.8h > mla v16.8h, v6.8h, v3.8h > - add v16.8h, v16.8h, v31.8h > - ushr v16.8h, v16.8h, #4 > + urshr v16.8h, v16.8h, #4 > str q16, [x14], #16 > 3: > ldur s5, [src, #1] > @@ -537,8 +507,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1 > uxtl v6.8h, v4.8b > mul v6.4h, v6.4h, v0.4h > mla v6.4h, v7.4h, v1.4h > - add v6.4h, v6.4h, v30.4h > - ushr v6.4h, v6.4h, #(8 - 6) > + urshr v6.4h, v6.4h, #(8 - 6) > str d6, [x13], #8 > > cbz w10, 4f > @@ -546,8 +515,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1 > ldr d16, [x12], #8 > mul v16.4h, v16.4h, v2.4h > mla v16.4h, v6.4h, v3.4h > - add v16.4h, v16.4h, v31.4h > - ushr v16.4h, v16.4h, #4 > + urshr v16.4h, v16.4h, #4 > str d16, [x14], #8 > 4: > subs height, height, #1 > -- > 2.47.2 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". 
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-02-20 8:08 UTC|newest] Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-02-19 17:40 [FFmpeg-devel] [PATCH 1/2] avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12} Krzysztof Pyrkosz via ffmpeg-devel 2025-02-19 17:40 ` [FFmpeg-devel] [PATCH 2/2] avcodec/aarch64/vvc: Use rounding shift NEON instruction Krzysztof Pyrkosz via ffmpeg-devel 2025-02-20 8:08 ` Zhao Zhili [this message] 2025-02-20 7:20 ` [FFmpeg-devel] [PATCH 1/2] avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12} Zhao Zhili 2025-02-20 18:49 ` Krzysztof Pyrkosz via ffmpeg-devel 2025-02-20 18:49 ` [FFmpeg-devel] [PATCH 2/2] avcodec/aarch64/vvc: Use rounding shift NEON instruction Krzysztof Pyrkosz via ffmpeg-devel
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=tencent_59919664CD30A044CD87CFB85503A72A3807@qq.com \ --to=quinkblack-at-foxmail.com@ffmpeg.org \ --cc=ffmpeg-devel@ffmpeg.org \ --cc=ffmpeg@szaka.eu \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git