From: Nuo Mi <nuomi2021@gmail.com> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: Krzysztof Pyrkosz <ffmpeg@szaka.eu> Subject: Re: [FFmpeg-devel] [PATCH v2] avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12} Date: Sun, 9 Mar 2025 21:43:14 +0800 Message-ID: <CAFXK13eS+c7h72L0hDkwOBheu1ec+3sdTWcfY+OXwx=aD_4TiQ@mail.gmail.com> (raw) In-Reply-To: <8c7bea1-5f7f-9ae5-8be8-861f21e4f8ae@martin.st> On Fri, Mar 7, 2025 at 9:56 PM Martin Storsjö <martin@martin.st> wrote: > On Mon, 3 Mar 2025, Krzysztof Pyrkosz via ffmpeg-devel wrote: > > > This patch replaces integer widening with halving addition, and > > multi-step "emulated" rounding shift with a single asm instruction doing > > exactly that. > > > > Benchmarks before and after: > > A78 > > avg_8_64x64_neon: 2686.2 ( 6.12x) > > avg_8_128x128_neon: 10734.2 ( 5.88x) > > avg_10_64x64_neon: 2536.8 ( 5.40x) > > avg_10_128x128_neon: 10079.0 ( 5.22x) > > avg_12_64x64_neon: 2548.2 ( 5.38x) > > avg_12_128x128_neon: 10133.8 ( 5.19x) > > > > avg_8_64x64_neon: 897.8 (18.26x) > > avg_8_128x128_neon: 3608.5 (17.37x) > > avg_10_32x32_neon: 444.2 ( 8.51x) > > avg_10_64x64_neon: 1711.8 ( 8.00x) > > avg_12_64x64_neon: 1706.2 ( 8.02x) > > avg_12_128x128_neon: 7010.0 ( 7.46x) > > > > A72 > > avg_8_64x64_neon: 5823.4 ( 3.88x) > > avg_8_128x128_neon: 17430.5 ( 4.73x) > > avg_10_64x64_neon: 5228.1 ( 3.71x) > > avg_10_128x128_neon: 16722.2 ( 4.17x) > > avg_12_64x64_neon: 5379.1 ( 3.51x) > > avg_12_128x128_neon: 16715.7 ( 4.17x) > > > > avg_8_64x64_neon: 2006.5 (10.61x) > > avg_8_128x128_neon: 9158.7 ( 8.96x) > > avg_10_64x64_neon: 3357.7 ( 5.60x) > > avg_10_128x128_neon: 12411.7 ( 5.56x) > > avg_12_64x64_neon: 3317.5 ( 5.67x) > > avg_12_128x128_neon: 12358.5 ( 5.58x) > > > > A53 > > avg_8_64x64_neon: 8327.8 ( 5.18x) > > avg_8_128x128_neon: 31631.3 ( 5.34x) > > avg_10_64x64_neon: 8783.5 ( 4.98x) > > avg_10_128x128_neon: 32617.0 ( 5.25x) > > avg_12_64x64_neon: 8686.0 ( 5.06x) > > avg_12_128x128_neon: 32487.5 ( 5.25x) > > 
> > avg_8_64x64_neon: 6032.3 ( 7.17x) > > avg_8_128x128_neon: 22008.5 ( 7.69x) > > avg_10_64x64_neon: 7738.0 ( 5.68x) > > avg_10_128x128_neon: 27813.8 ( 6.14x) > > avg_12_64x64_neon: 7844.5 ( 5.60x) > > avg_12_128x128_neon: 26999.5 ( 6.34x) > > --- > > libavcodec/aarch64/vvc/inter.S | 177 ++++++++++++++++++++++++--------- > > 1 file changed, 130 insertions(+), 47 deletions(-) > > > > diff --git a/libavcodec/aarch64/vvc/inter.S > b/libavcodec/aarch64/vvc/inter.S > > index 0edc861f97..b2f44697d3 100644 > > --- a/libavcodec/aarch64/vvc/inter.S > > +++ b/libavcodec/aarch64/vvc/inter.S > > @@ -24,9 +24,9 @@ > > #define BDOF_BLOCK_SIZE 16 > > #define BDOF_MIN_BLOCK_SIZE 4 > > > > -.macro vvc_avg type, bit_depth > > +.macro vvc_avg bit_depth > > > > -.macro vvc_\type\()_\bit_depth\()_2_4 tap > > +.macro vvc_w_avg_\bit_depth\()_2_4 tap > > .if \tap == 2 > > ldr s0, [src0] > > ldr s2, [src1] > > @@ -34,18 +34,11 @@ > > ldr d0, [src0] > > ldr d2, [src1] > > .endif > > - > > -.ifc \type, avg > > - saddl v4.4s, v0.4h, v2.4h > > - add v4.4s, v4.4s, v16.4s > > - sqshrun v4.4h, v4.4s, #(15 - \bit_depth) > > -.else > > mov v4.16b, v16.16b > > smlal v4.4s, v0.4h, v19.4h > > smlal v4.4s, v2.4h, v20.4h > > sqshl v4.4s, v4.4s, v22.4s > > sqxtun v4.4h, v4.4s > > -.endif > > > > .if \bit_depth == 8 > > sqxtun v4.8b, v4.8h > > @@ -68,7 +61,7 @@ > > add dst, dst, dst_stride > > .endm > > > > -function ff_vvc_\type\()_\bit_depth\()_neon, export=1 > > +function ff_vvc_w_avg_\bit_depth\()_neon, export=1 > > dst .req x0 > > dst_stride .req x1 > > src0 .req x2 > > @@ -78,9 +71,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1 > > > > mov x10, #(VVC_MAX_PB_SIZE * 2) > > cmp width, #8 > > -.ifc \type, avg > > - movi v16.4s, #(1 << (14 - \bit_depth)) > > -.else > > lsr x11, x6, #32 // weight0 > > mov w12, w6 // weight1 > > lsr x13, x7, #32 // offset > > @@ -91,9 +81,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1 > > dup v20.8h, w12 > > dup v16.4s, w13 > > dup v22.4s, w14 > > 
-.endif // avg > > > > - .if \bit_depth >= 10 > > +.if \bit_depth >= 10 > > // clip pixel > > mov w6, #((1 << \bit_depth) - 1) > > dup v17.8h, w6 > > @@ -105,25 +94,17 @@ function ff_vvc_\type\()_\bit_depth\()_neon, > export=1 > > b.eq 4f > > 2: // width == 2 > > subs height, height, #1 > > - vvc_\type\()_\bit_depth\()_2_4 2 > > + vvc_w_avg_\bit_depth\()_2_4 2 > > b.ne 2b > > b 32f > > 4: // width == 4 > > subs height, height, #1 > > - vvc_\type\()_\bit_depth\()_2_4 4 > > + vvc_w_avg_\bit_depth\()_2_4 4 > > b.ne 4b > > b 32f > > 8: // width == 8 > > ld1 {v0.8h}, [src0], x10 > > ld1 {v2.8h}, [src1], x10 > > -.ifc \type, avg > > - saddl v4.4s, v0.4h, v2.4h > > - saddl2 v5.4s, v0.8h, v2.8h > > - add v4.4s, v4.4s, v16.4s > > - add v5.4s, v5.4s, v16.4s > > - sqshrun v4.4h, v4.4s, #(15 - \bit_depth) > > - sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth) > > -.else > > mov v4.16b, v16.16b > > mov v5.16b, v16.16b > > smlal v4.4s, v0.4h, v19.4h > > @@ -134,7 +115,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1 > > sqshl v5.4s, v5.4s, v22.4s > > sqxtun v4.4h, v4.4s > > sqxtun2 v4.8h, v5.4s > > -.endif > > subs height, height, #1 > > .if \bit_depth == 8 > > sqxtun v4.8b, v4.8h > > @@ -153,20 +133,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, > export=1 > > 17: > > ldp q0, q1, [x7], #32 > > ldp q2, q3, [x8], #32 > > -.ifc \type, avg > > - saddl v4.4s, v0.4h, v2.4h > > - saddl2 v5.4s, v0.8h, v2.8h > > - saddl v6.4s, v1.4h, v3.4h > > - saddl2 v7.4s, v1.8h, v3.8h > > - add v4.4s, v4.4s, v16.4s > > - add v5.4s, v5.4s, v16.4s > > - add v6.4s, v6.4s, v16.4s > > - add v7.4s, v7.4s, v16.4s > > - sqshrun v4.4h, v4.4s, #(15 - \bit_depth) > > - sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth) > > - sqshrun v6.4h, v6.4s, #(15 - \bit_depth) > > - sqshrun2 v6.8h, v7.4s, #(15 - \bit_depth) > > -.else // avg > > mov v4.16b, v16.16b > > mov v5.16b, v16.16b > > mov v6.16b, v16.16b > > @@ -187,7 +153,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1 > > sqxtun v6.4h, v6.4s > > sqxtun2 
v4.8h, v5.4s > > sqxtun2 v6.8h, v7.4s > > -.endif // w_avg > > subs w6, w6, #16 > > .if \bit_depth == 8 > > sqxtun v4.8b, v4.8h > > @@ -217,12 +182,130 @@ function ff_vvc_\type\()_\bit_depth\()_neon, > export=1 > > endfunc > > .endm > > > > -vvc_avg avg, 8 > > -vvc_avg avg, 10 > > -vvc_avg avg, 12 > > -vvc_avg w_avg, 8 > > -vvc_avg w_avg, 10 > > -vvc_avg w_avg, 12 > > +vvc_avg 8 > > +vvc_avg 10 > > +vvc_avg 12 > > + > > +.macro vvc_avg2 bit_depth > > Instead of naming this vvc_avg2, and the old one (which only produces the > w_avg function now) vvc_avg, we could rename the old one to vvc_w_avg, and > the new one to plain vvc_avg. > > I did that change and pushed this patch now, thanks! > Great! Thank you, Krzysztof, Zhili and Martin > > // Martin > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
prev parent reply other threads:[~2025-03-09 13:43 UTC|newest] Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-02-19 17:40 [FFmpeg-devel] [PATCH 1/2] " Krzysztof Pyrkosz via ffmpeg-devel 2025-02-19 17:40 ` [FFmpeg-devel] [PATCH 2/2] avcodec/aarch64/vvc: Use rounding shift NEON instruction Krzysztof Pyrkosz via ffmpeg-devel 2025-02-20 8:08 ` Zhao Zhili 2025-03-01 22:34 ` Martin Storsjö 2025-03-03 21:32 ` [FFmpeg-devel] [PATCH v2] avcodec/aarch64/vvc: Optimize NEON version of vvc_dmvr Krzysztof Pyrkosz via ffmpeg-devel 2025-03-04 8:36 ` Martin Storsjö 2025-02-20 7:20 ` [FFmpeg-devel] [PATCH 1/2] avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12} Zhao Zhili 2025-02-20 18:49 ` Krzysztof Pyrkosz via ffmpeg-devel 2025-02-20 18:49 ` [FFmpeg-devel] [PATCH 2/2] avcodec/aarch64/vvc: Use rounding shift NEON instruction Krzysztof Pyrkosz via ffmpeg-devel 2025-02-26 8:54 ` [FFmpeg-devel] [PATCH 1/2] avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12} Zhao Zhili 2025-03-01 22:21 ` Martin Storsjö 2025-03-03 21:18 ` [FFmpeg-devel] [PATCH v2] " Krzysztof Pyrkosz via ffmpeg-devel 2025-03-07 13:56 ` Martin Storsjö 2025-03-09 13:43 ` Nuo Mi [this message]
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to='CAFXK13eS+c7h72L0hDkwOBheu1ec+3sdTWcfY+OXwx=aD_4TiQ@mail.gmail.com' \ --to=nuomi2021@gmail.com \ --cc=ffmpeg-devel@ffmpeg.org \ --cc=ffmpeg@szaka.eu \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git