From: Dmitriy Kovalenko <dmtr.kovalenko@outlook.com> To: ffmpeg-devel@ffmpeg.org Cc: Dmitriy Kovalenko <dmtr.kovalenko@outlook.com> Subject: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time Date: Sat, 31 May 2025 11:11:45 +0200 Message-ID: <DBAP193MB0956E3B62E794F364ED459C98D60A@DBAP193MB0956.EURP193.PROD.OUTLOOK.COM> (raw) In-Reply-To: <20250531091631.45342-1-dmtr.kovalenko@outlook.com> This patch integrates so-called double buffering, where we load 2 batches of elements at a time and then process them in parallel. On modern Arm processors, especially Apple Silicon, it gives a visible benefit; for subsampled pixel processing it is especially nice because it allows reading elements with 2 instructions and writing them with a single one (especially visible on platforms with slower memory, like iOS). Including the previous patch in the stack, on a MacBook Pro M4 Max rgb_to_yuv_half in checkasm reaches 2x the speed of the C version --- libswscale/aarch64/input.S | 130 ++++++++++++++++++++++++++----------- 1 file changed, 91 insertions(+), 39 deletions(-) diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S index 260a26e965..b90ca05996 100644 --- a/libswscale/aarch64/input.S +++ b/libswscale/aarch64/input.S @@ -178,7 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1 .macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0 function ff_\fmt_bgr\()ToUV_half_neon, export=1 - cbz w5, 3f // check width > 0 + cbz w5, 3f ldp w12, w11, [x6, #12] ldp w10, w15, [x6, #20] @@ -187,49 +187,101 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1 endfunc function ff_\fmt_rgb\()ToUV_half_neon, export=1 - cmp w5, #0 // check width > 0 + cmp w5, #0 b.le 3f - ldp w10, w11, [x6, #12] // w10: ru, w11: gu - ldp w12, w13, [x6, #20] // w12: bu, w13: rv - ldp w14, w15, [x6, #28] // w14: gv, w15: bv + ldp w10, w11, [x6, #12] + ldp w12, w13, [x6, #20] + ldp w14, w15, [x6, #28] 4: - cmp w5, #8 rgb_set_uv_coeff half=1 + + cmp w5, #16 b.lt 2f 
-1: // load 16 pixels + +1: .if \element == 3 ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48 + ld3 { v26.16b, v27.16b, v28.16b }, [x3], #48 .else ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64 + ld4 { v26.16b, v27.16b, v28.16b, v29.16b }, [x3], #64 .endif .if \alpha_first - uaddlp v21.8h, v19.16b // v21: summed b pairs - uaddlp v20.8h, v18.16b // v20: summed g pairs - uaddlp v19.8h, v17.16b // v19: summed r pairs + uaddlp v21.8h, v19.16b + uaddlp v20.8h, v18.16b + uaddlp v19.8h, v17.16b + uaddlp v31.8h, v29.16b + uaddlp v30.8h, v28.16b + uaddlp v29.8h, v27.16b .else - uaddlp v19.8h, v16.16b // v19: summed r pairs - uaddlp v20.8h, v17.16b // v20: summed g pairs - uaddlp v21.8h, v18.16b // v21: summed b pairs + uaddlp v19.8h, v16.16b + uaddlp v20.8h, v17.16b + uaddlp v21.8h, v18.16b + uaddlp v29.8h, v26.16b + uaddlp v30.8h, v27.16b + uaddlp v31.8h, v28.16b .endif - mov v22.16b, v6.16b // U first half - mov v23.16b, v6.16b // U second half - mov v24.16b, v6.16b // V first half - mov v25.16b, v6.16b // V second half - - rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10 - - str q16, [x0], #16 // store dst_u - str q17, [x1], #16 // store dst_v + mov v7.16b, v6.16b + mov v16.16b, v6.16b + mov v17.16b, v6.16b + mov v18.16b, v6.16b + mov v26.16b, v6.16b + mov v27.16b, v6.16b + mov v28.16b, v6.16b + mov v25.16b, v6.16b - sub w5, w5, #8 // width -= 8 - cmp w5, #8 // width >= 8 ? 
+ smlal v7.4s, v0.4h, v19.4h + smlal v17.4s, v3.4h, v19.4h + smlal v26.4s, v0.4h, v29.4h + smlal v28.4s, v3.4h, v29.4h + + smlal2 v16.4s, v0.8h, v19.8h + smlal2 v18.4s, v3.8h, v19.8h + smlal2 v27.4s, v0.8h, v29.8h + smlal2 v25.4s, v3.8h, v29.8h + + smlal v7.4s, v1.4h, v20.4h + smlal v17.4s, v4.4h, v20.4h + smlal v26.4s, v1.4h, v30.4h + smlal v28.4s, v4.4h, v30.4h + + smlal2 v16.4s, v1.8h, v20.8h + smlal2 v18.4s, v4.8h, v20.8h + smlal2 v27.4s, v1.8h, v30.8h + smlal2 v25.4s, v4.8h, v30.8h + + smlal v7.4s, v2.4h, v21.4h + smlal v17.4s, v5.4h, v21.4h + smlal v26.4s, v2.4h, v31.4h + smlal v28.4s, v5.4h, v31.4h + + smlal2 v16.4s, v2.8h, v21.8h + smlal2 v18.4s, v5.8h, v21.8h + smlal2 v27.4s, v2.8h, v31.8h + smlal2 v25.4s, v5.8h, v31.8h + + sqshrn v19.4h, v7.4s, #10 + sqshrn v20.4h, v17.4s, #10 + sqshrn v22.4h, v26.4s, #10 + sqshrn v23.4h, v28.4s, #10 + + sqshrn2 v19.8h, v16.4s, #10 + sqshrn2 v20.8h, v18.4s, #10 + sqshrn2 v22.8h, v27.4s, #10 + sqshrn2 v23.8h, v25.4s, #10 + + stp q19, q22, [x0], #32 + stp q20, q23, [x1], #32 + + sub w5, w5, #16 + cmp w5, #16 b.ge 1b - cbz w5, 3f // No pixels left? 
Exit + cbz w5, 3f -2: // Scalar fallback for remaining pixels +2: .if \alpha_first rgb_load_add_half 1, 5, 2, 6, 3, 7 .else @@ -239,24 +291,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1 rgb_load_add_half 0, 4, 1, 5, 2, 6 .endif .endif - smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset - smaddl x16, w2, w13, x9 // dst_v = rv * r + const_offset (parallel) + smaddl x8, w2, w10, x9 + smaddl x16, w2, w13, x9 - smaddl x8, w4, w11, x8 // dst_u += gu * g - smaddl x16, w4, w14, x16 // dst_v += gv * g (parallel) + smaddl x8, w4, w11, x8 + smaddl x16, w4, w14, x16 - smaddl x8, w7, w12, x8 // dst_u += bu * b - smaddl x16, w7, w15, x16 // dst_v += bv * b (parallel) + smaddl x8, w7, w12, x8 + smaddl x16, w7, w15, x16 - asr w8, w8, #10 // dst_u >>= 10 - asr w16, w16, #10 // dst_v >>= 10 + asr w8, w8, #10 + asr w16, w16, #10 - strh w8, [x0], #2 // store dst_u - strh w16, [x1], #2 // store dst_v + strh w8, [x0], #2 + strh w16, [x1], #2 - sub w5, w5, #1 // width-- - add x3, x3, #(2*\element) // Advance source pointer - cbnz w5, 2b // Process next pixel if any left + sub w5, w5, #1 + add x3, x3, #(2*\element) + cbnz w5, 2b 3: ret endfunc -- 2.49.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-05-31 9:17 UTC|newest] Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top [not found] <20250531091631.45342-1-dmtr.kovalenko@outlook.com> 2025-05-31 9:11 ` [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations Dmitriy Kovalenko 2025-06-05 12:00 ` Martin Storsjö 2025-05-31 9:11 ` Dmitriy Kovalenko [this message] 2025-05-31 10:32 ` [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time Kieran Kunhya via ffmpeg-devel 2025-05-31 10:43 ` Dmitriy Kovalenko 2025-05-31 12:13 ` Martin Storsjö 2025-05-31 12:21 ` Dmitriy Kovalenko 2025-06-05 12:13 ` Martin Storsjö
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=DBAP193MB0956E3B62E794F364ED459C98D60A@DBAP193MB0956.EURP193.PROD.OUTLOOK.COM \ --to=dmtr.kovalenko@outlook.com \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git