From: "Martin Storsjö" <martin@martin.st> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Subject: Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv Date: Thu, 20 Jun 2024 15:49:15 +0300 (EEST) Message-ID: <9b71b95e-1d51-cf1f-56c-5e8333adfc14@martin.st> (raw) In-Reply-To: <tencent_138357919C2EE1825A463473813A35D93E09@qq.com> On Thu, 20 Jun 2024, Zhao Zhili wrote: >> On Jun 19, 2024, at 20:05, Rémi Denis-Courmont <remi@remlab.net> wrote: >> >> Le 19 juin 2024 11:24:28 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit : >>> >>>> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote: >>>> >>>> >>>> >>>> Le 15 juin 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : >>>>> >>>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >>>>> index 2b956fe5c2..37f1158504 100644 >>>>> --- a/libswscale/aarch64/input.S >>>>> +++ b/libswscale/aarch64/input.S >>>>> @@ -20,8 +20,12 @@ >>>>> >>>>> #include "libavutil/aarch64/asm.S" >>>>> >>>>> -.macro rgb_to_yuv_load_rgb src >>>>> +.macro rgb_to_yuv_load_rgb src, element=3 >>>>> + .if \element == 3 >>>>> ld3 { v16.16b, v17.16b, v18.16b }, [\src] >>>>> + .else >>>>> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src] >>>>> + .endif >>>>> uxtl v19.8h, v16.8b // v19: r >>>>> uxtl v20.8h, v17.8b // v20: g >>>>> uxtl v21.8h, v18.8b // v21: b >>>>> @@ -43,7 +47,7 @@ >>>>> sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift >>>>> .endm >>>>> >>>>> -.macro rgbToY bgr >>>>> +.macro rgbToY bgr, element=3 >>>> >>>> AFAICT, you don't need to a macro parameter for component order. Just swap red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue but that's mostly negligible compared to the loop. >>> >>> I’m not sure where to add the branch. Could you elaborate? Do you mean load coefficients first like the following: >>> >>> function ff_bgr24ToUV_half_neon, export=1 >>> ldr w12, [x6, #12] >>> ldr w11, [x6, #16] >>> ldr w10, [x6, #20] >>> ldr w15, [x6, #24] >>> ldr w14, [x6, #28] >>> ldr w13, [x6, #32] >>> rgbToUV_half >>> endfunc >> >> Hmm, no. You need to jump past the loading of red and blue coefficients. It might help to load green coefficients last. >> >> By the way, I think you can use LDP instead of LDR. > > Patch v2 replace LDR by LDP, then the "jump past the loading of red and blue coefficients” doesn’t apply now. Rémi's point is that you don't need to duplicate the whole function, when the only thing you're changing is a couple of instructions in the prologue of the function. By reusing the actual bulk of the function, you save on binary size. One way of doing it looks like this: diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S index 1840f9fb01..eb870e4dca 100644 --- a/libavutil/aarch64/asm.S +++ b/libavutil/aarch64/asm.S @@ -256,5 +256,11 @@ ELF .size \name, . - \name #define JOIN(a, b) GLUE(a, b) #define X(s) JOIN(EXTERN_ASM, s) +#ifdef __APPLE__ +#define L(x) L ## x +#else +#define L(x) .L ## x +#endif + #define x18 do_not_use_x18 #define w18 do_not_use_w18 diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S index 33afa34111..ce10d584c6 100644 --- a/libswscale/aarch64/input.S +++ b/libswscale/aarch64/input.S @@ -49,6 +49,7 @@ function ff_rgb24ToY_neon, export=1 ldr w12, [x5, #8] // w12: by b.le 3f +L(rgb24ToY_internal): mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) dup v6.4s, w9 // w9: const_offset @@ -85,6 +86,14 @@ function ff_rgb24ToY_neon, export=1 ret endfunc +function ff_bgr24ToY_neon, export=1 + cmp w4, #0 // check width > 0 + ldp w12, w11, [x5] // w12: ry, w11: gy + ldr w10, [x5, #8] // w10: by + b.gt L(rgb24ToY_internal) + ret +endfunc + .macro rgb24_load_uv_coeff half ldp w10, w11, [x6, #12] // w10: ru, w11: gu ldp w12, w13, [x6, #20] // w12: bu, w13: rv Another way looks like this: diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S index 33afa34111..5c4b7a41fd 100644 --- a/libswscale/aarch64/input.S +++ b/libswscale/aarch64/input.S @@ -43,12 +43,23 @@ sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift .endm +function ff_bgr24ToY_neon, export=1 + cmp w4, #0 // check width > 0 + ldp w12, w11, [x5] // w12: ry, w11: gy + ldr w10, [x5, #8] // w10: by + b.gt rgb24ToY_internal + ret +endfunc + function ff_rgb24ToY_neon, export=1 cmp w4, #0 // check width > 0 ldp w10, w11, [x5] // w10: ry, w11: gy ldr w12, [x5, #8] // w12: by - b.le 3f + b.gt rgb24ToY_internal + ret +endfunc +function rgb24ToY_internal mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) dup v6.4s, w9 // w9: const_offset Or if you want to be really adventurous, you can make a fallthrough: diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S index 33afa34111..025a965b76 100644 --- a/libswscale/aarch64/input.S +++ b/libswscale/aarch64/input.S @@ -43,12 +43,22 @@ sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift .endm +function ff_bgr24ToY_neon, export=1 + cmp w4, #0 // check width > 0 + ldp w12, w11, [x5] // w12: ry, w11: gy + ldr w10, [x5, #8] // w10: by + b.gt rgb24ToY_internal + ret +endfunc + function ff_rgb24ToY_neon, export=1 cmp w4, #0 // check width > 0 ldp w10, w11, [x5] // w10: ry, w11: gy ldr w12, [x5, #8] // w12: by b.le 3f +endfunc +function rgb24ToY_internal mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) dup v6.4s, w9 // w9: const_offset // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2024-06-20 12:49 UTC|newest] Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top [not found] <20240615095718.37319-1-quinkblack@foxmail.com> 2024-06-15 9:57 ` Zhao Zhili 2024-06-18 20:32 ` Martin Storsjö 2024-06-19 7:07 ` Rémi Denis-Courmont 2024-06-19 9:24 ` Zhao Zhili 2024-06-19 12:05 ` Rémi Denis-Courmont 2024-06-19 17:15 ` Zhao Zhili 2024-06-20 12:49 ` Martin Storsjö [this message] 2024-06-20 16:02 ` Zhao Zhili 2024-06-20 16:25 ` Rémi Denis-Courmont
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=9b71b95e-1d51-cf1f-56c-5e8333adfc14@martin.st \ --to=martin@martin.st \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git