From: Krzysztof Pyrkosz via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: Krzysztof Pyrkosz <ffmpeg@szaka.eu> Subject: [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422} Date: Thu, 13 Feb 2025 20:02:29 +0100 Message-ID: <20250213190228.1351-2-ffmpeg@szaka.eu> (raw) In-Reply-To: <60217252-6f0-5563-bab4-4410e6cdab9@martin.st> This patch successfully passes the GitHub pipeline. The previous one, which adds tests, fails only the first check on Linux x86, probably because of that MMX issue. The tiny patch in the second email chain (the one about right shift by 2) completes the checks chain as-is. Krzysztof --- libswscale/aarch64/rgb2rgb.c | 16 ++ libswscale/aarch64/rgb2rgb_neon.S | 262 ++++++++++++++++++++++++++++++ 2 files changed, 278 insertions(+) diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c index 7e1dba572d..f474228298 100644 --- a/libswscale/aarch64/rgb2rgb.c +++ b/libswscale/aarch64/rgb2rgb.c @@ -67,6 +67,18 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size); +void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); +void ff_uyvytoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); +void ff_yuyvtoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); +void ff_yuyvtoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); av_cold void rgb2rgb_init_aarch64(void) { int cpu_flags = 
av_get_cpu_flags(); @@ -84,5 +96,9 @@ av_cold void rgb2rgb_init_aarch64(void) shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon; shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon; shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon; + uyvytoyuv422 = ff_uyvytoyuv422_neon; + uyvytoyuv420 = ff_uyvytoyuv420_neon; + yuyvtoyuv422 = ff_yuyvtoyuv422_neon; + yuyvtoyuv420 = ff_yuyvtoyuv420_neon; } } diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S index 22ecdf7ac8..191d20be25 100644 --- a/libswscale/aarch64/rgb2rgb_neon.S +++ b/libswscale/aarch64/rgb2rgb_neon.S @@ -427,3 +427,265 @@ neon_shuf 2013 neon_shuf 1203 neon_shuf 2130 neon_shuf 3210 + +/* +v0-v7 - two consecutive lines +x0 - upper Y destination +x1 - U destination +x2 - V destination +x3 - upper src line +w5 - width/iteration counter - count of line pairs for yuv420, of single lines for 422 +x6 - lum padding +x7 - chrom padding +x8 - src padding +w9 - number of bytes remaining in the tail +x10 - lower Y destination +w12 - tmp +x13 - lower src line +w14 - tmp +w17 - set to 1 if last line has to be handled separately (odd height) +*/ + +// one fast path iteration processes 16 uyvy tuples +// is_line_tail is set to 1 when final 16 tuples are being processed +// skip_storing_chroma is set to 1 when final line is processed and the height is odd +.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma + ld4 {v0.16b - v3.16b}, [x3], #64 +.if ! \is_line_tail + subs w14, w14, #32 +.endif + +.if ! 
\skip_storing_chroma +.ifc \dst_fmt, yuv420 + ld4 {v4.16b - v7.16b}, [x13], #64 +.endif + +.ifc \dst_fmt, yuv420 // store UV +.ifc \src_fmt, uyvy + uhadd v0.16b, v4.16b, v0.16b // halving sum of U + uhadd v2.16b, v6.16b, v2.16b // halving sum of V +.else + uhadd v1.16b, v5.16b, v1.16b // halving sum of U + uhadd v3.16b, v7.16b, v3.16b // halving sum of V +.endif +.endif + +.ifc \src_fmt, uyvy + st1 {v2.16b}, [x2], #16 + st1 {v0.16b}, [x1], #16 +.else + st1 {v3.16b}, [x2], #16 + st1 {v1.16b}, [x1], #16 +.endif + +.ifc \dst_fmt, yuv420 // store_y +.ifc \src_fmt, uyvy + mov v6.16b, v5.16b + st2 {v6.16b,v7.16b}, [x10], #32 +.else + mov v5.16b, v4.16b + st2 {v5.16b,v6.16b}, [x10], #32 +.endif +.endif + +.endif // ! \skip_storing_chroma + +.ifc \src_fmt, uyvy + mov v2.16b, v1.16b + st2 {v2.16b,v3.16b}, [x0], #32 +.else + mov v1.16b, v0.16b + st2 {v1.16b,v2.16b}, [x0], #32 +.endif +.endm + +// shift pointers back to width - 32 to process the tail of the line +// if the height is odd, processing the final line is simplified +.macro fastpath_shift_back_pointers src_fmt, dst_fmt, is_final_odd_line + add x3, x3, w9, sxtw #1 + sub x3, x3, #64 +.if ! \is_final_odd_line +.ifc \dst_fmt, yuv420 + add x13, x13, w9, sxtw #1 + sub x13, x13, #64 + add x10, x10, w9, sxtw + sub x10, x10, #32 +.endif +.endif + add x0, x0, w9, sxtw + sub x0, x0, #32 +.if ! 
\is_final_odd_line + asr w14, w9, #1 + add x1, x1, w14, sxtw + sub x1, x1, #16 + add x2, x2, w14, sxtw + sub x2, x2, #16 +.endif +.endm + +.macro slowpath_iteration src_fmt, dst_fmt, skip_storing_chroma +.ifc \dst_fmt, yuv422 +.ifc \src_fmt, uyvy + ldrb w12, [x3], #1 + ldrb w14, [x3], #1 + strb w12, [x1], #1 + strb w14, [x0], #1 + ldrb w12, [x3], #1 + ldrb w14, [x3], #1 + strb w12, [x2], #1 + strb w14, [x0], #1 +.else + ldrb w12, [x3], #1 + ldrb w14, [x3], #1 + strb w12, [x0], #1 + strb w14, [x1], #1 + ldrb w12, [x3], #1 + ldrb w14, [x3], #1 + strb w12, [x0], #1 + strb w14, [x2], #1 +.endif +.endif +.ifc \dst_fmt, yuv420 +.ifc \src_fmt, uyvy +.if \skip_storing_chroma + ldrb w12, [x3], #2 + ldrb w14, [x3], #2 + strb w12, [x0], #1 + strb w14, [x0], #1 +.else + ldrb w12, [x3], #1 + ldrb w14, [x13], #1 + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x1], #1 + ldrb w14, [x3], #1 + ldrb w12, [x13], #1 + strb w14, [x0], #1 + strb w12, [x10], #1 + ldrb w14, [x13], #1 + ldrb w12, [x3], #1 + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x2], #1 + ldrb w14, [x3], #1 + ldrb w12, [x13], #1 + strb w14, [x0], #1 + strb w12, [x10], #1 +.endif +.else +.if \skip_storing_chroma + ldrb w12, [x3], #2 + ldrb w14, [x3], #2 + strb w12, [x0], #1 + strb w14, [x0], #1 +.else + ldrb w12, [x3], #1 + ldrb w14, [x13], #1 + strb w12, [x0], #1 + strb w14, [x10], #1 + ldrb w12, [x3], #1 + ldrb w14, [x13], #1 + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x1], #1 + ldrb w14, [x3], #1 + ldrb w12, [x13], #1 + strb w14, [x0], #1 + strb w12, [x10], #1 + ldrb w14, [x13], #1 + ldrb w12, [x3], #1 + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x2], #1 +.endif +.endif +.endif +.endm + +.macro move_pointers_to_next_line src_fmt, dst_fmt, is_final_odd_line + add x3, x3, x8 + add x0, x0, x6 +.ifc \dst_fmt, yuv420 + add x13, x13, x8 + add x10, x10, x6 +.endif + add x1, x1, x7 + add x2, x2, x7 +.endm + +.macro interleaved_yuv_to_planar src_fmt, dst_fmt +function ff_\src_fmt\()to\dst_fmt\()_neon, 
export=1 + sxtw x6, w6 + sxtw x7, w7 + ldrsw x8, [sp] + ands w11, w4, #~31 // choose between fast and slow path + +.ifc \dst_fmt, yuv420 + add x10, x0, x6 + add x13, x3, x8 + add x8, x8, x8 + add x6, x6, x6 + and w17, w5, #1 + asr w5, w5, #1 +.endif + asr w9, w4, #1 + sub x8, x8, w4, sxtw #1 // src offset + sub x6, x6, w4, sxtw // lum offset + sub x7, x7, x9 // chr offset + + b.eq 6f + +1: // fast path - the width is at least 32 + and w14, w4, #~31 // w14 is the main loop counter + and w9, w4, #31 // w9 holds the remaining width, 0 to 31 +2: + fastpath_iteration \src_fmt, \dst_fmt, 0, 0 + b.ne 2b + fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0 + fastpath_iteration \src_fmt, \dst_fmt, 0, 0 + subs w5, w5, #1 + move_pointers_to_next_line \src_fmt, \dst_fmt + b.ne 1b + +.ifc \dst_fmt, yuv420 // handle the last line in case the height is odd + cbz w17, 3f + and w14, w4, #~31 +4: + fastpath_iteration \src_fmt, \dst_fmt, 0, 1 + b.ne 4b + fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1 + fastpath_iteration \src_fmt, \dst_fmt, 1, 1 +3: +.endif + ret + +6: // slow path - width is at most 31 + and w9, w4, #31 +7: + subs w9, w9, #2 + slowpath_iteration \src_fmt, \dst_fmt, 0 + b.ne 7b + subs w5, w5, #1 + move_pointers_to_next_line \src_fmt, \dst_fmt + b.ne 6b + +.ifc \dst_fmt, yuv420 + cbz w17, 8f + and w9, w4, #31 +.ifc \src_fmt, uyvy + add x3, x3, #1 +.endif +5: + subs w9, w9, #2 + slowpath_iteration \src_fmt, \dst_fmt, 1 + b.ne 5b +8: +.endif + ret +endfunc +.endm + +interleaved_yuv_to_planar uyvy, yuv422 +interleaved_yuv_to_planar uyvy, yuv420 +interleaved_yuv_to_planar yuyv, yuv422 +interleaved_yuv_to_planar yuyv, yuv420 -- 2.47.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-02-13 19:06 UTC|newest] Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-02-07 19:06 [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422 Krzysztof Pyrkosz via ffmpeg-devel 2025-02-10 13:15 ` Martin Storsjö 2025-02-11 21:24 ` Krzysztof Pyrkosz via ffmpeg-devel 2025-02-11 21:33 ` Krzysztof Pyrkosz via ffmpeg-devel 2025-02-11 21:53 ` Martin Storsjö 2025-02-11 22:06 ` [FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_rgb: Added {yuyv, uyvy}toyuv{420, 422} test cases Krzysztof Pyrkosz via ffmpeg-devel 2025-02-12 9:37 ` Martin Storsjö 2025-02-17 9:06 ` Martin Storsjö 2025-02-17 9:37 ` Martin Storsjö 2025-02-11 22:06 ` [FFmpeg-devel] [PATCH 2/2] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422} Krzysztof Pyrkosz via ffmpeg-devel 2025-02-12 12:02 ` Martin Storsjö 2025-02-13 19:02 ` Krzysztof Pyrkosz via ffmpeg-devel [this message] 2025-02-17 9:39 ` [FFmpeg-devel] [PATCH] " Martin Storsjö
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20250213190228.1351-2-ffmpeg@szaka.eu \ --to=ffmpeg-devel@ffmpeg.org \ --cc=ffmpeg@szaka.eu \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git