From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTPS id 8C82B4CF3E for ; Thu, 13 Feb 2025 19:06:23 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id F147B68BEF1; Thu, 13 Feb 2025 21:06:19 +0200 (EET) Received: from szaka.eu (szaka.eu [144.217.86.229]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id A087468B224 for ; Thu, 13 Feb 2025 21:06:13 +0200 (EET) To: ffmpeg-devel@ffmpeg.org Date: Thu, 13 Feb 2025 20:02:29 +0100 Message-ID: <20250213190228.1351-2-ffmpeg@szaka.eu> X-Mailer: git-send-email 2.47.2 In-Reply-To: <60217252-6f0-5563-bab4-4410e6cdab9@martin.st> References: <60217252-6f0-5563-bab4-4410e6cdab9@martin.st> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422} X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , From: Krzysztof Pyrkosz via ffmpeg-devel Reply-To: FFmpeg development discussions and patches Cc: Krzysztof Pyrkosz Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: This patch successfully passes the GitHub pipeline. The previous one, which adds tests, fails only the first check on Linux x86, probably because of that MMX issue. The tiny patch in the second email chain (the one about right shift by 2) completes the checks chain as-is. 
Krzysztof --- libswscale/aarch64/rgb2rgb.c | 16 ++ libswscale/aarch64/rgb2rgb_neon.S | 262 ++++++++++++++++++++++++++++++ 2 files changed, 278 insertions(+) diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c index 7e1dba572d..f474228298 100644 --- a/libswscale/aarch64/rgb2rgb.c +++ b/libswscale/aarch64/rgb2rgb.c @@ -67,6 +67,18 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size); +void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); +void ff_uyvytoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); +void ff_yuyvtoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); +void ff_yuyvtoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); av_cold void rgb2rgb_init_aarch64(void) { int cpu_flags = av_get_cpu_flags(); @@ -84,5 +96,9 @@ av_cold void rgb2rgb_init_aarch64(void) shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon; shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon; shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon; + uyvytoyuv422 = ff_uyvytoyuv422_neon; + uyvytoyuv420 = ff_uyvytoyuv420_neon; + yuyvtoyuv422 = ff_yuyvtoyuv422_neon; + yuyvtoyuv420 = ff_yuyvtoyuv420_neon; } } diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S index 22ecdf7ac8..191d20be25 100644 --- a/libswscale/aarch64/rgb2rgb_neon.S +++ b/libswscale/aarch64/rgb2rgb_neon.S @@ -427,3 +427,265 @@ neon_shuf 2013 neon_shuf 1203 neon_shuf 2130 neon_shuf 3210 + +/* 
+v0-v7 - two consecutive lines +x0 - upper Y destination +x1 - U destination +x2 - V destination +x3 - upper src line +w5 - height/iteration counter - count of line pairs for yuv420, of single lines for 422 +x6 - lum padding +x7 - chrom padding +x8 - src padding +w9 - number of bytes remaining in the tail +x10 - lower Y destination +w12 - tmp +x13 - lower src line +w14 - tmp +w17 - set to 1 if last line has to be handled separately (odd height) +*/ + +// one fast path iteration processes 16 uyvy tuples +// is_line_tail is set to 1 when final 16 tuples are being processed +// skip_storing_chroma is set to 1 when final line is processed and the height is odd +.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma + ld4 {v0.16b - v3.16b}, [x3], #64 +.if ! \is_line_tail + subs w14, w14, #32 +.endif + +.if ! \skip_storing_chroma +.ifc \dst_fmt, yuv420 + ld4 {v4.16b - v7.16b}, [x13], #64 +.endif + +.ifc \dst_fmt, yuv420 // store UV +.ifc \src_fmt, uyvy + uhadd v0.16b, v4.16b, v0.16b // halving sum of U + uhadd v2.16b, v6.16b, v2.16b // halving sum of V +.else + uhadd v1.16b, v5.16b, v1.16b // halving sum of U + uhadd v3.16b, v7.16b, v3.16b // halving sum of V +.endif +.endif + +.ifc \src_fmt, uyvy + st1 {v2.16b}, [x2], #16 + st1 {v0.16b}, [x1], #16 +.else + st1 {v3.16b}, [x2], #16 + st1 {v1.16b}, [x1], #16 +.endif + +.ifc \dst_fmt, yuv420 // store_y +.ifc \src_fmt, uyvy + mov v6.16b, v5.16b + st2 {v6.16b,v7.16b}, [x10], #32 +.else + mov v5.16b, v4.16b + st2 {v5.16b,v6.16b}, [x10], #32 +.endif +.endif + +.endif // ! \skip_storing_chroma + +.ifc \src_fmt, uyvy + mov v2.16b, v1.16b + st2 {v2.16b,v3.16b}, [x0], #32 +.else + mov v1.16b, v0.16b + st2 {v1.16b,v2.16b}, [x0], #32 +.endif +.endm + +// shift pointers back to width - 32 to process the tail of the line +// if the height is odd, processing the final line is simplified +.macro fastpath_shift_back_pointers src_fmt, dst_fmt, is_final_odd_line + add x3, x3, w9, sxtw #1 + sub x3, x3, #64 +.if ! 
\is_final_odd_line +.ifc \dst_fmt, yuv420 + add x13, x13, w9, sxtw #1 + sub x13, x13, #64 + add x10, x10, w9, sxtw + sub x10, x10, #32 +.endif +.endif + add x0, x0, w9, sxtw + sub x0, x0, #32 +.if ! \is_final_odd_line + asr w14, w9, #1 + add x1, x1, w14, sxtw + sub x1, x1, #16 + add x2, x2, w14, sxtw + sub x2, x2, #16 +.endif +.endm + +.macro slowpath_iteration src_fmt, dst_fmt, skip_storing_chroma +.ifc \dst_fmt, yuv422 +.ifc \src_fmt, uyvy + ldrb w12, [x3], #1 + ldrb w14, [x3], #1 + strb w12, [x1], #1 + strb w14, [x0], #1 + ldrb w12, [x3], #1 + ldrb w14, [x3], #1 + strb w12, [x2], #1 + strb w14, [x0], #1 +.else + ldrb w12, [x3], #1 + ldrb w14, [x3], #1 + strb w12, [x0], #1 + strb w14, [x1], #1 + ldrb w12, [x3], #1 + ldrb w14, [x3], #1 + strb w12, [x0], #1 + strb w14, [x2], #1 +.endif +.endif +.ifc \dst_fmt, yuv420 +.ifc \src_fmt, uyvy +.if \skip_storing_chroma + ldrb w12, [x3], #2 + ldrb w14, [x3], #2 + strb w12, [x0], #1 + strb w14, [x0], #1 +.else + ldrb w12, [x3], #1 + ldrb w14, [x13], #1 + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x1], #1 + ldrb w14, [x3], #1 + ldrb w12, [x13], #1 + strb w14, [x0], #1 + strb w12, [x10], #1 + ldrb w14, [x13], #1 + ldrb w12, [x3], #1 + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x2], #1 + ldrb w14, [x3], #1 + ldrb w12, [x13], #1 + strb w14, [x0], #1 + strb w12, [x10], #1 +.endif +.else +.if \skip_storing_chroma + ldrb w12, [x3], #2 + ldrb w14, [x3], #2 + strb w12, [x0], #1 + strb w14, [x0], #1 +.else + ldrb w12, [x3], #1 + ldrb w14, [x13], #1 + strb w12, [x0], #1 + strb w14, [x10], #1 + ldrb w12, [x3], #1 + ldrb w14, [x13], #1 + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x1], #1 + ldrb w14, [x3], #1 + ldrb w12, [x13], #1 + strb w14, [x0], #1 + strb w12, [x10], #1 + ldrb w14, [x13], #1 + ldrb w12, [x3], #1 + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x2], #1 +.endif +.endif +.endif +.endm + +.macro move_pointers_to_next_line src_fmt, dst_fmt, is_final_odd_line + add x3, x3, x8 + add x0, x0, x6 +.ifc 
\dst_fmt, yuv420 + add x13, x13, x8 + add x10, x10, x6 +.endif + add x1, x1, x7 + add x2, x2, x7 +.endm + +.macro interleaved_yuv_to_planar src_fmt, dst_fmt +function ff_\src_fmt\()to\dst_fmt\()_neon, export=1 + sxtw x6, w6 + sxtw x7, w7 + ldrsw x8, [sp] + ands w11, w4, #~31 // choose between fast and slow path + +.ifc \dst_fmt, yuv420 + add x10, x0, x6 + add x13, x3, x8 + add x8, x8, x8 + add x6, x6, x6 + and w17, w5, #1 + asr w5, w5, #1 +.endif + asr w9, w4, #1 + sub x8, x8, w4, sxtw #1 // src offset + sub x6, x6, w4, sxtw // lum offset + sub x7, x7, x9 // chr offset + + b.eq 6f + +1: // fast path - the width is at least 32 + and w14, w4, #~31 // w14 is the main loop counter + and w9, w4, #31 // w9 holds the remaining width, 0 to 31 +2: + fastpath_iteration \src_fmt, \dst_fmt, 0, 0 + b.ne 2b + fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0 + fastpath_iteration \src_fmt, \dst_fmt, 0, 0 + subs w5, w5, #1 + move_pointers_to_next_line \src_fmt, \dst_fmt + b.ne 1b + +.ifc \dst_fmt, yuv420 // handle the last line in case the height is odd + cbz w17, 3f + and w14, w4, #~31 +4: + fastpath_iteration \src_fmt, \dst_fmt, 0, 1 + b.ne 4b + fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1 + fastpath_iteration \src_fmt, \dst_fmt, 1, 1 +3: +.endif + ret + +6: // slow path - width is at most 31 + and w9, w4, #31 +7: + subs w9, w9, #2 + slowpath_iteration \src_fmt, \dst_fmt, 0 + b.ne 7b + subs w5, w5, #1 + move_pointers_to_next_line \src_fmt, \dst_fmt + b.ne 6b + +.ifc \dst_fmt, yuv420 + cbz w17, 8f + and w9, w4, #31 +.ifc \src_fmt, uyvy + add x3, x3, #1 +.endif +5: + subs w9, w9, #2 + slowpath_iteration \src_fmt, \dst_fmt, 1 + b.ne 5b +8: +.endif + ret +endfunc +.endm + +interleaved_yuv_to_planar uyvy, yuv422 +interleaved_yuv_to_planar uyvy, yuv420 +interleaved_yuv_to_planar yuyv, yuv422 +interleaved_yuv_to_planar yuyv, yuv420 -- 2.47.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org 
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".