[FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422} - Krzysztof Pyrkosz via ffmpeg-devel

From: Krzysztof Pyrkosz via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: Krzysztof Pyrkosz <ffmpeg@szaka.eu>
Subject: [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422}
Date: Thu, 13 Feb 2025 20:02:29 +0100
Message-ID: <20250213190228.1351-2-ffmpeg@szaka.eu> (raw)
In-Reply-To: <60217252-6f0-5563-bab4-4410e6cdab9@martin.st>

This patch successfully passes the GitHub pipeline. The previous one,
which adds tests, fails only the first check on Linux x86, probably
because of that MMX issue.

The tiny patch in the second email chain (the one about right shift by
2) completes the checks chain as-is.

Krzysztof

---
 libswscale/aarch64/rgb2rgb.c      |  16 ++
 libswscale/aarch64/rgb2rgb_neon.S | 262 ++++++++++++++++++++++++++++++
 2 files changed, 278 insertions(+)

diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index 7e1dba572d..f474228298 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -67,6 +67,18 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
 
+void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
 av_cold void rgb2rgb_init_aarch64(void)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -84,5 +96,9 @@ av_cold void rgb2rgb_init_aarch64(void)
         shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
         shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
         shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
+        uyvytoyuv422       = ff_uyvytoyuv422_neon;
+        uyvytoyuv420       = ff_uyvytoyuv420_neon;
+        yuyvtoyuv422       = ff_yuyvtoyuv422_neon;
+        yuyvtoyuv420       = ff_yuyvtoyuv420_neon;
     }
 }
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 22ecdf7ac8..191d20be25 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -427,3 +427,265 @@ neon_shuf 2013
 neon_shuf 1203
 neon_shuf 2130
 neon_shuf 3210
+
+/*
+v0-v7 - two consecutive lines
+x0 - upper Y destination
+x1 - U destination
+x2 - V destination
+x3 - upper src line
+w5 - height/iteration counter - count of line pairs for yuv420, of single lines for 422
+x6 - lum padding
+x7 - chrom padding
+x8 - src padding
+w9 - number of pixels (luma bytes) remaining in the tail
+x10 - lower Y destination
+w12 - tmp
+x13 - lower src line
+w14 - tmp
+w17 - set to 1 if last line has to be handled separately (odd height)
+*/
+
+// one fast path iteration converts 16 4-byte tuples (32 pixels) of the upper and, for yuv420, of the lower line
+// is_line_tail is set to 1 when final 16 tuples are being processed; the w14 block counter is then not decremented
+// skip_storing_chroma is set to 1 when final line is processed and the height is odd; only upper line Y is stored
+.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma
+        ld4             {v0.16b - v3.16b}, [x3], #64      // upper line: uyvy gives v0=U v1=Y v2=V v3=Y, yuyv gives v0=Y v1=U v2=Y v3=V
+.if ! \is_line_tail
+        subs            w14, w14, #32                     // 32 pixels consumed; the flags are tested by the b.ne after the macro
+.endif
+
+.if ! \skip_storing_chroma
+.ifc \dst_fmt, yuv420
+        ld4             {v4.16b - v7.16b}, [x13], #64     // lower line, same layout in v4-v7
+.endif
+
+.ifc \dst_fmt, yuv420                                    // store UV
+.ifc \src_fmt, uyvy
+        uhadd           v0.16b, v4.16b, v0.16b            // halving sum of U
+        uhadd           v2.16b, v6.16b, v2.16b            // halving sum of V
+.else
+        uhadd           v1.16b, v5.16b, v1.16b            // halving sum of U
+        uhadd           v3.16b, v7.16b, v3.16b            // halving sum of V
+.endif
+.endif
+
+.ifc \src_fmt, uyvy
+        st1             {v2.16b}, [x2], #16               // 16 V bytes (averaged over both lines for yuv420)
+        st1             {v0.16b}, [x1], #16               // 16 U bytes (averaged over both lines for yuv420)
+.else
+        st1             {v3.16b}, [x2], #16               // 16 V bytes (averaged over both lines for yuv420)
+        st1             {v1.16b}, [x1], #16               // 16 U bytes (averaged over both lines for yuv420)
+.endif
+
+.ifc \dst_fmt, yuv420                                    // store_y
+.ifc \src_fmt, uyvy
+        mov             v6.16b, v5.16b                    // even Y moved next to odd Y: st2 takes consecutive registers
+        st2             {v6.16b,v7.16b}, [x10], #32       // 32 interleaved Y bytes of the lower line
+.else
+        mov             v5.16b, v4.16b                    // even Y moved next to odd Y: st2 takes consecutive registers
+        st2             {v5.16b,v6.16b}, [x10], #32       // 32 interleaved Y bytes of the lower line
+.endif
+.endif
+
+.endif // ! \skip_storing_chroma
+
+.ifc \src_fmt, uyvy
+        mov             v2.16b, v1.16b                    // even Y overwrites V, which is no longer needed
+        st2             {v2.16b,v3.16b}, [x0], #32        // 32 interleaved Y bytes of the upper line
+.else
+        mov             v1.16b, v0.16b                    // even Y overwrites U, which is no longer needed
+        st2             {v1.16b,v2.16b}, [x0], #32        // 32 interleaved Y bytes of the upper line
+.endif
+.endm
+
+// shift pointers back to width - 32 to process the tail of the line (w9 = width & 31 pixels are left over)
+// is_final_odd_line is set to 1 for the last line of an odd height: only the src and upper Y pointers are moved
+.macro fastpath_shift_back_pointers src_fmt, dst_fmt, is_final_odd_line
+        add             x3, x3, w9, sxtw #1               // upper src: 2 bytes per pixel, net move is 2 * w9 - 64
+        sub             x3, x3, #64
+.if ! \is_final_odd_line
+.ifc \dst_fmt, yuv420
+        add             x13, x13, w9, sxtw #1             // lower src: net move is 2 * w9 - 64
+        sub             x13, x13, #64
+        add             x10, x10, w9, sxtw                // lower Y: 1 byte per pixel, net move is w9 - 32
+        sub             x10, x10, #32
+.endif
+.endif
+        add             x0, x0, w9, sxtw                  // upper Y: net move is w9 - 32
+        sub             x0, x0, #32
+.if ! \is_final_odd_line
+        asr             w14, w9, #1                       // one U and one V byte per 2 pixels; overwrites the w14 block counter
+        add             x1, x1, w14, sxtw                 // U: net move is w9 / 2 - 16
+        sub             x1, x1, #16
+        add             x2, x2, w14, sxtw                 // V: net move is w9 / 2 - 16
+        sub             x2, x2, #16
+.endif
+.endm
+// one slow path iteration converts one 4-byte tuple (2 pixels) of each processed line using byte loads and stores
+.macro slowpath_iteration src_fmt, dst_fmt, skip_storing_chroma
+.ifc \dst_fmt, yuv422
+.ifc \src_fmt, uyvy
+        ldrb            w12, [x3], #1                     // U
+        ldrb            w14, [x3], #1                     // Y0
+        strb            w12, [x1], #1
+        strb            w14, [x0], #1
+        ldrb            w12, [x3], #1                     // V
+        ldrb            w14, [x3], #1                     // Y1
+        strb            w12, [x2], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1                     // Y0
+        ldrb            w14, [x3], #1                     // U
+        strb            w12, [x0], #1
+        strb            w14, [x1], #1
+        ldrb            w12, [x3], #1                     // Y1
+        ldrb            w14, [x3], #1                     // V
+        strb            w12, [x0], #1
+        strb            w14, [x2], #1
+.endif
+.endif
+.ifc \dst_fmt, yuv420
+.ifc \src_fmt, uyvy
+.if \skip_storing_chroma
+        ldrb            w12, [x3], #2                     // Y0; the caller has already moved x3 past the leading U
+        ldrb            w14, [x3], #2                     // Y1; the step of 2 jumps over the chroma byte
+        strb            w12, [x0], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1                     // upper U
+        ldrb            w14, [x13], #1                    // lower U
+        add             w12, w12, w14
+        lsr             w12, w12, #1                      // truncating average of both lines
+        strb            w12, [x1], #1
+        ldrb            w14, [x3], #1                     // upper Y0
+        ldrb            w12, [x13], #1                    // lower Y0
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+        ldrb            w14, [x13], #1                    // lower V
+        ldrb            w12, [x3], #1                     // upper V
+        add             w12, w12, w14
+        lsr             w12, w12, #1                      // truncating average of both lines
+        strb            w12, [x2], #1
+        ldrb            w14, [x3], #1                     // upper Y1
+        ldrb            w12, [x13], #1                    // lower Y1
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+.endif
+.else
+.if \skip_storing_chroma
+        ldrb            w12, [x3], #2                     // Y0; the step of 2 jumps over U
+        ldrb            w14, [x3], #2                     // Y1; the step of 2 jumps over V
+        strb            w12, [x0], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1                     // upper Y0
+        ldrb            w14, [x13], #1                    // lower Y0
+        strb            w12, [x0], #1
+        strb            w14, [x10], #1
+        ldrb            w12, [x3], #1                     // upper U
+        ldrb            w14, [x13], #1                    // lower U
+        add             w12, w12, w14
+        lsr             w12, w12, #1                      // truncating average of both lines
+        strb            w12, [x1], #1
+        ldrb            w14, [x3], #1                     // upper Y1
+        ldrb            w12, [x13], #1                    // lower Y1
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+        ldrb            w14, [x13], #1                    // lower V
+        ldrb            w12, [x3], #1                     // upper V
+        add             w12, w12, w14
+        lsr             w12, w12, #1                      // truncating average of both lines
+        strb            w12, [x2], #1
+.endif
+.endif
+.endif
+.endm
+// advance every pointer by its padding to the next line (pair); plain adds keep the flags of the preceding subs intact
+.macro move_pointers_to_next_line src_fmt, dst_fmt, is_final_odd_line
+        add             x1, x1, x7                        // U destination += chrom padding
+        add             x2, x2, x7                        // V destination += chrom padding
+.ifc \dst_fmt, yuv420
+        add             x10, x10, x6                      // lower Y destination += lum padding
+        add             x13, x13, x8                      // lower src line += src padding
+.endif
+        add             x0, x0, x6                        // upper Y destination += lum padding
+        add             x3, x3, x8                        // upper src line += src padding
+.endm
+// emits ff_<src_fmt>to<dst_fmt>_neon(ydst, udst, vdst, src, width, height, lumStride, chromStride, srcStride)
+.macro interleaved_yuv_to_planar src_fmt, dst_fmt
+function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
+        sxtw            x6, w6                            // lumStride
+        sxtw            x7, w7                            // chromStride
+        ldrsw           x8, [sp]                          // srcStride, the 9th argument, is read from the stack
+        ands            w11, w4, #~31                     // choose between fast and slow path
+
+.ifc \dst_fmt, yuv420
+        add             x10, x0, x6                       // lower Y line = ydst + lumStride
+        add             x13, x3, x8                       // lower src line = src + srcStride
+        add             x8, x8, x8                        // each iteration consumes two src lines
+        add             x6, x6, x6                        // and produces two lum lines
+        and             w17, w5, #1                       // odd height leaves one line after the pairs
+        asr             w5, w5, #1                        // line pairs; NOTE(review): height 1 yields 0 and the loops below do not test for it - confirm callers
+.endif
+        asr             w9, w4, #1                        // chroma bytes per line
+        sub             x8, x8, w4, sxtw #1               // src offset
+        sub             x6, x6, w4, sxtw                  // lum offset
+        sub             x7, x7, x9                        // chr offset
+
+        b.eq            6f                                // flags still come from the ands above
+
+1:                                                        // fast path - the width is at least 32
+        and             w14, w4, #~31                     // w14 is the main loop counter
+        and             w9, w4, #31                       // w9 holds the remaining width, 0 to 31
+2:                                                        // full 32 pixel blocks, then one overlapping tail block that ends at the line end
+        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+        b.ne            2b
+        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0
+        fastpath_iteration \src_fmt, \dst_fmt, 1, 0
+        subs            w5, w5, #1                        // flags survive the pointer update and feed the b.ne below
+        move_pointers_to_next_line \src_fmt, \dst_fmt
+        b.ne            1b
+
+.ifc \dst_fmt, yuv420                                    // handle the last line in case the height is odd
+        cbz             w17, 3f
+        and             w14, w4, #~31                     // reload the block counter for the last line
+4:
+        fastpath_iteration \src_fmt, \dst_fmt, 0, 1
+        b.ne            4b
+        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1
+        fastpath_iteration \src_fmt, \dst_fmt, 1, 1
+3:
+.endif
+        ret
+
+6:                                                        // slow path - width is at most 31
+        and             w9, w4, #31                       // w9 counts the pixels left in the line
+7:
+        subs            w9, w9, #2                        // 2 pixels per tuple; NOTE(review): reaches zero only for even, non-zero width - confirm callers
+        slowpath_iteration \src_fmt, \dst_fmt, 0
+        b.ne            7b
+        subs            w5, w5, #1                        // flags survive the pointer update and feed the b.ne below
+        move_pointers_to_next_line \src_fmt, \dst_fmt
+        b.ne            6b
+
+.ifc \dst_fmt, yuv420
+        cbz             w17, 8f
+        and             w9, w4, #31
+.ifc \src_fmt, uyvy
+        add             x3, x3, #1                        // skip the leading U: the chroma-less iteration reads every 2nd byte
+.endif
+5:
+        subs            w9, w9, #2
+        slowpath_iteration \src_fmt, \dst_fmt, 1
+        b.ne            5b
+8:
+.endif
+        ret
+endfunc
+.endm
+
+interleaved_yuv_to_planar uyvy, yuv422
+interleaved_yuv_to_planar uyvy, yuv420
+interleaved_yuv_to_planar yuyv, yuv422
+interleaved_yuv_to_planar yuyv, yuv420
-- 
2.47.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".