From: Zhao Zhili <quinkblack@foxmail.com> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Subject: Re: [FFmpeg-devel] [PATCH v3 2/3] swscale/aarch64: Add bgra/rgba to yuv Date: Mon, 24 Jun 2024 21:04:48 +0800 Message-ID: <tencent_DBC19A5D40ED9A269F88AB4A4E9B706F2D08@qq.com> (raw) In-Reply-To: <caf7dc3-4022-6a1-99eb-c5f99947770@martin.st> > On Jun 24, 2024, at 19:55, Martin Storsjö <martin@martin.st> wrote: > > On Mon, 24 Jun 2024, Zhao Zhili wrote: > >> From: Zhao Zhili <zhilizhao@tencent.com> >> >> Test on Apple M1 with kperf >> : -O3 : -O3 -fno-vectorize >> bgra_to_uv_8_c : 13.4 : 27.5 >> bgra_to_uv_8_neon : 37.4 : 41.7 >> bgra_to_uv_128_c : 155.9 : 550.2 >> bgra_to_uv_128_neon : 91.7 : 92.7 >> bgra_to_uv_1080_c : 1173.2 : 4558.2 >> bgra_to_uv_1080_neon : 822.7 : 809.5 >> bgra_to_uv_1920_c : 2078.2 : 8115.2 >> bgra_to_uv_1920_neon : 1437.7 : 1438.7 >> bgra_to_uv_half_8_c : 17.9 : 14.2 >> bgra_to_uv_half_8_neon : 37.4 : 10.5 >> bgra_to_uv_half_128_c : 103.9 : 326.0 >> bgra_to_uv_half_128_neon : 73.9 : 68.7 >> bgra_to_uv_half_1080_c : 850.2 : 3732.0 >> bgra_to_uv_half_1080_neon : 484.2 : 490.0 >> bgra_to_uv_half_1920_c : 1479.2 : 4942.7 >> bgra_to_uv_half_1920_neon : 824.2 : 824.7 >> bgra_to_y_8_c : 8.2 : 29.5 >> bgra_to_y_8_neon : 18.2 : 32.7 >> bgra_to_y_128_c : 101.4 : 361.5 >> bgra_to_y_128_neon : 74.9 : 73.7 >> bgra_to_y_1080_c : 739.4 : 3018.0 >> bgra_to_y_1080_neon : 613.4 : 544.2 >> bgra_to_y_1920_c : 1298.7 : 5326.0 >> bgra_to_y_1920_neon : 918.7 : 934.2 >> --- >> libswscale/aarch64/input.S | 91 ++++++++++++++++++++++++++++++------ >> libswscale/aarch64/swscale.c | 16 +++++++ >> 2 files changed, 94 insertions(+), 13 deletions(-) >> >> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >> index 2cfec4cb6a..6d2c6034bb 100644 >> --- a/libswscale/aarch64/input.S >> +++ b/libswscale/aarch64/input.S >> @@ -20,8 +20,12 @@ >> >> #include "libavutil/aarch64/asm.S" >> >> -.macro rgb_to_yuv_load_rgb src >> +.macro rgb_to_yuv_load_rgb src, element=3 >> + .if \element == 3 >> ld3 { v16.16b, v17.16b, v18.16b }, [\src] >> + .else >> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src] >> + .endif >> uxtl v19.8h, v16.8b // v19: r >> uxtl v20.8h, v17.8b // v20: g >> uxtl v21.8h, v18.8b // v21: b >> @@ -51,7 +55,8 @@ function ff_bgr24ToY_neon, export=1 >> ret >> endfunc >> >> -function ff_rgb24ToY_neon, export=1 >> +.macro rgbToY_neon fmt, element >> +function ff_\fmt\()ToY_neon, export=1 >> cmp w4, #0 // check width > 0 >> ldp w10, w11, [x5] // w10: ry, w11: gy >> ldr w12, [x5, #8] // w12: by >> @@ -67,11 +72,11 @@ function ff_rgb24ToY_neon, export=1 >> dup v2.8h, w12 >> b.lt 2f >> 1: >> - rgb_to_yuv_load_rgb x1 >> + rgb_to_yuv_load_rgb x1, \element >> rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >> rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >> sub w4, w4, #16 // width -= 16 >> - add x1, x1, #48 // src += 48 >> + add x1, x1, #(16*\element) >> cmp w4, #16 // width >= 16 ? >> stp q16, q17, [x0], #32 // store to dst >> b.ge 1b >> @@ -86,12 +91,25 @@ function ff_rgb24ToY_neon, export=1 >> smaddl x13, w15, w12, x13 // x13 += by * b >> asr w13, w13, #9 // x13 >>= 9 >> sub w4, w4, #1 // width-- >> - add x1, x1, #3 // src += 3 >> + add x1, x1, #\element >> strh w13, [x0], #2 // store to dst >> cbnz w4, 2b >> 3: >> ret >> endfunc >> +.endm >> + >> +rgbToY_neon fmt=rgb24, element=3 >> + >> +function ff_bgra32ToY_neon, export=1 >> + cmp w4, #0 // check width > 0 >> + ldp w12, w11, [x5] // w12: ry, w11: gy >> + ldr w10, [x5, #8] // w10: by >> + b.gt 4f >> + ret >> +endfunc >> + >> +rgbToY_neon fmt=rgba32, element=4 > > It is extremely obscure to jump to a local label (4f) that is defined by the following macro. I think this would be much more readable if you'd include the bgr(a) version in the macro, so the reference to 4f is near to the actual label it refers to. Good idea, it saved a lot of typing. Fixed in v4. > >> .macro rgb_set_uv_coeff half >> .if \half >> @@ -120,7 +138,8 @@ function ff_bgr24ToUV_half_neon, export=1 >> b 4f >> endfunc >> >> -function ff_rgb24ToUV_half_neon, export=1 >> +.macro rgbToUV_half_neon fmt, element >> +function ff_\fmt\()ToUV_half_neon, export=1 >> cmp w5, #0 // check width > 0 >> b.le 3f >> >> @@ -132,7 +151,11 @@ function ff_rgb24ToUV_half_neon, export=1 >> rgb_set_uv_coeff half=1 >> b.lt 2f >> 1: >> + .if \element == 3 >> ld3 { v16.16b, v17.16b, v18.16b }, [x3] >> + .else >> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3] >> + .endif >> uaddlp v19.8h, v16.16b // v19: r >> uaddlp v20.8h, v17.16b // v20: g >> uaddlp v21.8h, v18.16b // v21: b >> @@ -140,7 +163,7 @@ function ff_rgb24ToUV_half_neon, export=1 >> rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10 >> rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10 >> sub w5, w5, #8 // width -= 8 >> - add x3, x3, #48 // src += 48 >> + add x3, x3, #(16*\element) >> cmp w5, #8 // width >= 8 ? >> str q16, [x0], #16 // store dst_u >> str q17, [x1], #16 // store dst_v >> @@ -148,9 +171,10 @@ function ff_rgb24ToUV_half_neon, export=1 >> cbz w5, 3f >> 2: >> ldrb w2, [x3] // w2: r1 >> - ldrb w4, [x3, #3] // w4: r2 >> + ldrb w4, [x3, #\element] // w4: r2 >> add w2, w2, w4 // w2 = r1 + r2 >> >> + .if \element == 3 >> ldrb w4, [x3, #1] // w4: g1 >> ldrb w7, [x3, #4] // w7: g2 >> add w4, w4, w7 // w4 = g1 + g2 >> @@ -158,6 +182,15 @@ function ff_rgb24ToUV_half_neon, export=1 >> ldrb w7, [x3, #2] // w7: b1 >> ldrb w8, [x3, #5] // w8: b2 >> add w7, w7, w8 // w7 = b1 + b2 >> + .else >> + ldrb w4, [x3, #1] // w4: g1 >> + ldrb w7, [x3, #5] // w7: g2 >> + add w4, w4, w7 // w4 = g1 + g2 >> + >> + ldrb w7, [x3, #2] // w7: b1 >> + ldrb w8, [x3, #6] // w8: b2 >> + add w7, w7, w8 // w7 = b1 + b2 >> + .endif >> >> smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset >> smaddl x8, w4, w11, x8 // dst_u += gu * g >> @@ -170,12 +203,28 @@ function ff_rgb24ToUV_half_neon, export=1 >> smaddl x8, w7, w15, x8 // dst_v += bv * b >> asr x8, x8, #10 // dst_v >>= 10 >> sub w5, w5, #1 >> - add x3, x3, #6 // src += 6 >> + ldrb w4, [x3, #1] // w4: g1 >> + add x3, x3, #(2*\element) > > Is the new ldrb a typo/copypaste mistake here? Yes, it’s a copypaste mistake. Fixed in v4. > > // Martin > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2024-06-24 13:05 UTC|newest] Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top [not found] <20240624113701.94616-1-quinkblack@foxmail.com> 2024-06-24 11:37 ` Zhao Zhili 2024-06-24 11:55 ` Martin Storsjö 2024-06-24 13:02 ` [FFmpeg-devel] [PATCH v4 1/3] swscale/aarch64: Add bgr24 " Zhao Zhili [not found] ` <20240624130213.71634-1-quinkblack@foxmail.com> 2024-06-24 13:02 ` [FFmpeg-devel] [PATCH v4 2/3] swscale/aarch64: Add bgra/rgba " Zhao Zhili 2024-06-24 13:02 ` [FFmpeg-devel] [PATCH v4 3/3] swscale/aarch64: Add argb/abgr " Zhao Zhili 2024-06-24 13:10 ` Martin Storsjö 2024-06-24 13:04 ` Zhao Zhili [this message] 2024-06-24 11:37 ` [FFmpeg-devel] [PATCH v3 " Zhao Zhili
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=tencent_DBC19A5D40ED9A269F88AB4A4E9B706F2D08@qq.com \ --to=quinkblack@foxmail.com \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git