Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422
@ 2025-02-07 19:06 Krzysztof Pyrkosz via ffmpeg-devel
  2025-02-10 13:15 ` Martin Storsjö
  0 siblings, 1 reply; 9+ messages in thread
From: Krzysztof Pyrkosz via ffmpeg-devel @ 2025-02-07 19:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Krzysztof Pyrkosz

The patch contains NEON code that splits the uyvy input array into 3
separate buffers.

The existing test cases cover scenarios with odd height and odd
stride, but the width is even in every instance. Is it safe to assume
that the width is always even?

Just as I'm about to send this patch, I'm wondering whether a
non-interleaved read followed by 4 invocations of TBL would be more
performant: one call to generate a contiguous vector of U, a second for
V, and two for Y. I'm curious to find out.

The speed:

A78:
uyvytoyuv422_c:                                      42213.5 ( 1.00x)
uyvytoyuv422_neon:                                    5298.8 ( 7.97x)

A72:
uyvytoyuv422_c:                                      61797.6 ( 1.00x)
uyvytoyuv422_neon:                                   12141.9 ( 5.09x)

x13s:
uyvytoyuv422_c:                                      28581.7 ( 1.00x)
uyvytoyuv422_neon:                                    4882.4 ( 5.85x)

Krzysztof

---
 libswscale/aarch64/rgb2rgb.c      |  5 +++
 libswscale/aarch64/rgb2rgb_neon.S | 51 +++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index 7e1dba572d..096ed9f363 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -67,6 +67,10 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
 
+void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+
 av_cold void rgb2rgb_init_aarch64(void)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -84,5 +88,6 @@ av_cold void rgb2rgb_init_aarch64(void)
         shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
         shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
         shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
+        uyvytoyuv422       = ff_uyvytoyuv422_neon;
     }
 }
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 22ecdf7ac8..bdbee7df2e 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -427,3 +427,54 @@ neon_shuf 2013
 neon_shuf 1203
 neon_shuf 2130
 neon_shuf 3210
+
+function ff_uyvytoyuv422_neon, export=1
+        sxtw             x6, w6
+        sxtw             x7, w7
+        ldrsw            x8, [sp]
+        ubfx             x10, x4, #1, #31
+        sub              x8, x8, w4, sxtw #1               // src offset
+        sub              x6, x6, w4, sxtw                  // lum offset
+        sub              x7, x7, x10                       // chr offset
+1:
+        sub              w5, w5, #1
+        ands             w10, w4, #~31
+        and              w9, w4, #15
+        and              w11, w4, #16
+        b.eq             7f
+4:
+        ld4              {v0.16b - v3.16b}, [x3], #64      // handle 16 uyvy tuples per iteration
+        subs             w10, w10, #32
+        st1              {v2.16b}, [x2], #16
+        st1              {v0.16b}, [x1], #16
+        mov              v2.16b, v1.16b
+        st2              {v2.16b,v3.16b}, [x0], #32
+        b.ne             4b
+7:
+        cbz              w11, 5f                           // 8 - 15 remaining
+        ld4              {v0.8b - v3.8b}, [x3], #32
+        st1              {v2.8b}, [x2], #8
+        st1              {v0.8b}, [x1], #8
+        mov              v2.8b, v1.8b
+        st2              {v2.8b,v3.8b}, [x0], #16
+5:
+        cbz              w9, 3f
+2:
+        subs             w9, w9, #2                        // 0 - 7 left
+        ldrb             w12, [x3], #1
+        strb             w12, [x1], #1
+        ldrb             w12, [x3], #1
+        strb             w12, [x0], #1
+        ldrb             w12, [x3], #1
+        strb             w12, [x2], #1
+        ldrb             w12, [x3], #1
+        strb             w12, [x0], #1
+        b.ne             2b
+3:
+        add              x3, x3, x8
+        add              x0, x0, x6
+        add              x1, x1, x7
+        add              x2, x2, x7
+        cbnz             w5, 1b
+        ret
+endfunc
-- 
2.47.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422
  2025-02-07 19:06 [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422 Krzysztof Pyrkosz via ffmpeg-devel
@ 2025-02-10 13:15 ` Martin Storsjö
  2025-02-11 21:24   ` Krzysztof Pyrkosz via ffmpeg-devel
  2025-02-11 21:33   ` Krzysztof Pyrkosz via ffmpeg-devel
  0 siblings, 2 replies; 9+ messages in thread
From: Martin Storsjö @ 2025-02-10 13:15 UTC (permalink / raw)
  To: Krzysztof Pyrkosz via ffmpeg-devel; +Cc: Krzysztof Pyrkosz

On Fri, 7 Feb 2025, Krzysztof Pyrkosz via ffmpeg-devel wrote:

> The patch contains NEON code that splits the uyvy input array into 3
> separate buffers.
>
> The existing test cases are covering scenarios with odd height and odd
> stride, but width is even in every instance. Is it safe to make that
> assumption about the width?

For something like uyvy, I'm kinda ok with assuming that, especially if we 
don't have test coverage for it. But ideally, it would of course be clear 
how those aspects are handled wrt assembly functions, indeed!

> Just as I'm about to send this patch, I'm thinking if non-interleaved
> read followed by 4 invocations of TBL wouldn't be more performant. One
> call to generate a contiguous vector of u, second for v and two for y.
> I'm curious to find out.

My guess is that it may be more performant on more modern cores, but 
probably not on older ones.

>
> The speed:
>
> A78:
> uyvytoyuv422_c:                                      42213.5 ( 1.00x)
> uyvytoyuv422_neon:                                    5298.8 ( 7.97x)
>
> A72:
> uyvytoyuv422_c:                                      61797.6 ( 1.00x)
> uyvytoyuv422_neon:                                   12141.9 ( 5.09x)
>
> x13s:
> uyvytoyuv422_c:                                      28581.7 ( 1.00x)
> uyvytoyuv422_neon:                                    4882.4 ( 5.85x)
>
> Krzysztof
>
> ---
> libswscale/aarch64/rgb2rgb.c      |  5 +++
> libswscale/aarch64/rgb2rgb_neon.S | 51 +++++++++++++++++++++++++++++++
> 2 files changed, 56 insertions(+)
>
> diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
> index 7e1dba572d..096ed9f363 100644
> --- a/libswscale/aarch64/rgb2rgb.c
> +++ b/libswscale/aarch64/rgb2rgb.c
> @@ -67,6 +67,10 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
>
> +void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> +                          const uint8_t *src, int width, int height,
> +                          int lumStride, int chromStride, int srcStride);
> +
> av_cold void rgb2rgb_init_aarch64(void)
> {
>     int cpu_flags = av_get_cpu_flags();
> @@ -84,5 +88,6 @@ av_cold void rgb2rgb_init_aarch64(void)
>         shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
>         shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
>         shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
> +        uyvytoyuv422       = ff_uyvytoyuv422_neon;
>     }
> }
> diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
> index 22ecdf7ac8..bdbee7df2e 100644
> --- a/libswscale/aarch64/rgb2rgb_neon.S
> +++ b/libswscale/aarch64/rgb2rgb_neon.S
> @@ -427,3 +427,54 @@ neon_shuf 2013
> neon_shuf 1203
> neon_shuf 2130
> neon_shuf 3210
> +
> +function ff_uyvytoyuv422_neon, export=1
> +        sxtw             x6, w6

The indentation of the arguments column is off by one char within this 
whole function.

For testing various aarch64 assembly details (building with unusual 
toolchains etc), I've set up a little extra CI for that on github; have a 
look at the topmost 4 commits of the branch at 
https://github.com/mstorsjo/ffmpeg/commits/gha-aarch64.

If you include those 4 commits in a branch and push to a personal fork at 
github, you'll get test results for that push like this: 
https://github.com/mstorsjo/FFmpeg/actions/runs/13240513523

For these two patches, I got the following results:
https://github.com/mstorsjo/FFmpeg/actions/runs/13240526767

This points out the unexpected indentation here.

> +        sxtw             x7, w7
> +        ldrsw            x8, [sp]
> +        ubfx             x10, x4, #1, #31

The ubfx instruction is kinda esoteric; I presume what you're doing here 
is essentially the same as "lsr #1"? That'd be much more idiomatic and 
readable.

> +        sub              x8, x8, w4, sxtw #1               // src offset
> +        sub              x6, x6, w4, sxtw                  // lum offset
> +        sub              x7, x7, x10                       // chr offset
> +1:
> +        sub              w5, w5, #1

It feels a bit unusual to do the decrement of the outer loop counter here 
at this point; I feel that it would be more readable in context if it was 
done at the end after the 3: label. (There can of course be good reasons 
for doing it early due to scheduling etc, but I don't see such a case 
here.)

> +        ands             w10, w4, #~31
> +        and              w9, w4, #15
> +        and              w11, w4, #16
> +        b.eq             7f
> +4:
> +        ld4              {v0.16b - v3.16b}, [x3], #64      // handle 16 uyvy tuples per iteration
> +        subs             w10, w10, #32
> +        st1              {v2.16b}, [x2], #16
> +        st1              {v0.16b}, [x1], #16
> +        mov              v2.16b, v1.16b
> +        st2              {v2.16b,v3.16b}, [x0], #32
> +        b.ne             4b
> +7:
> +        cbz              w11, 5f                           // 8 - 15 remaining
> +        ld4              {v0.8b - v3.8b}, [x3], #32
> +        st1              {v2.8b}, [x2], #8
> +        st1              {v0.8b}, [x1], #8
> +        mov              v2.8b, v1.8b
> +        st2              {v2.8b,v3.8b}, [x0], #16
> +5:
> +        cbz              w9, 3f
> +2:
> +        subs             w9, w9, #2                        // 0 - 7 left
> +        ldrb             w12, [x3], #1
> +        strb             w12, [x1], #1
> +        ldrb             w12, [x3], #1
> +        strb             w12, [x0], #1
> +        ldrb             w12, [x3], #1
> +        strb             w12, [x2], #1
> +        ldrb             w12, [x3], #1
> +        strb             w12, [x0], #1
> +        b.ne             2b
> +3:
> +        add              x3, x3, x8
> +        add              x0, x0, x6
> +        add              x1, x1, x7
> +        add              x2, x2, x7
> +        cbnz             w5, 1b
> +        ret
> +endfunc

If the height decrement is moved into the end here, it can be a subs with 
a regular b.gt branch.

Other than that, this looks quite reasonable, thanks!

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422
  2025-02-10 13:15 ` Martin Storsjö
@ 2025-02-11 21:24   ` Krzysztof Pyrkosz via ffmpeg-devel
  2025-02-11 21:33   ` Krzysztof Pyrkosz via ffmpeg-devel
  1 sibling, 0 replies; 9+ messages in thread
From: Krzysztof Pyrkosz via ffmpeg-devel @ 2025-02-11 21:24 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Krzysztof Pyrkosz

[-- Attachment #1: Type: text/plain, Size: 1130 bytes --]

On Mon, Feb 10, 2025 at 03:15:35PM +0200, Martin Storsjö wrote:
> > Just as I'm about to send this patch, I'm thinking if non-interleaved
> > read followed by 4 invocations of TBL wouldn't be more performant. One
> > call to generate a contiguous vector of u, second for v and two for y.
> > I'm curious to find out.
> 
> My guess is that it may be more performant on more modern cores, but
> probably not on older ones.

That's the case. It's 15% faster on A78 and twice as slow on A72.

> 
> > +        sxtw             x7, w7
> > +        ldrsw            x8, [sp]
> > +        ubfx             x10, x4, #1, #31
> 
> The ubfx instruction is kinda esoteric; I presume what you're doing here is
> essentially the same as "lsr #1"? That'd be much more idiomatic and
> readable.

That's correct. What put me off was that the width is passed as a 32-bit
int (w4), while I wanted x10 to hold a 64-bit value with the high bits
set to 0. "lsr w10, w4, #1" already does that, since a write to a W
register zeroes the upper 32 bits of the corresponding X register.

I modified the code to handle {uyvy,yuyv}toyuv{420,422} using macros,
since these 4 functions share common routines. The code lost some
readability, though.

Krzysztof

[-- Attachment #2: 0001-swscale-aarch64-rgb2rgb_neon-Implemented-uyvytoyuv42.patch --]
[-- Type: text/x-diff, Size: 15406 bytes --]

From a27f554ada9f2e81b4b19d313c9f19b348824ef1 Mon Sep 17 00:00:00 2001
From: Krzysztof Pyrkosz <ffmpeg@szaka.eu>
Date: Tue, 11 Feb 2025 22:04:19 +0100
Subject: [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422

---
 libswscale/aarch64/rgb2rgb.c      |  16 ++
 libswscale/aarch64/rgb2rgb_neon.S | 262 ++++++++++++++++++++++++++++++
 tests/checkasm/sw_rgb.c           |  63 ++++---
 3 files changed, 318 insertions(+), 23 deletions(-)

diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index 7e1dba572d..f474228298 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -67,6 +67,18 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
 
+void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
 av_cold void rgb2rgb_init_aarch64(void)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -84,5 +96,9 @@ av_cold void rgb2rgb_init_aarch64(void)
         shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
         shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
         shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
+        uyvytoyuv422       = ff_uyvytoyuv422_neon;
+        uyvytoyuv420       = ff_uyvytoyuv420_neon;
+        yuyvtoyuv422       = ff_yuyvtoyuv422_neon;
+        yuyvtoyuv420       = ff_yuyvtoyuv420_neon;
     }
 }
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 22ecdf7ac8..9002aa028f 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -427,3 +427,265 @@ neon_shuf 2013
 neon_shuf 1203
 neon_shuf 2130
 neon_shuf 3210
+
+/*
+v0-v7 - two consecutive lines
+x0 - upper Y destination
+x1 - U destination
+x2 - V destination
+x3 - upper src line
+w5 - height/iteration counter - count of line pairs for yuv420, of single lines for yuv422
+x6 - lum padding
+x7 - chrom padding
+x8 - src padding
+w9 - number of bytes remaining in the tail
+x10 - lower Y destination
+w12 - tmp
+x13 - lower src line
+w14 - tmp
+w17 - set to 1 if last line has to be handled separately (odd height)
+*/
+
+// one fast path iteration processes 16 uyvy tuples
+// is_line_tail is set to 1 when final 16 tuples are being processed
+// skip_storing_chroma is set to 1 when final line is processed and the height is odd
+.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma
+        ld4             {v0.16b - v3.16b}, [x3], #64
+.if ! \is_line_tail
+        subs            w14, w14, #32
+.endif
+
+.if ! \skip_storing_chroma
+.if \dst_fmt == yuv420
+        ld4             {v4.16b - v7.16b}, [x13], #64
+.endif
+
+.if \dst_fmt == yuv420                                    // store UV
+.if \src_fmt == uyvy
+        uhadd           v0.16b, v4.16b, v0.16b            // halving sum of U
+        uhadd           v2.16b, v6.16b, v2.16b            // halving sum of V
+.else
+        uhadd           v1.16b, v5.16b, v1.16b            // halving sum of U
+        uhadd           v3.16b, v7.16b, v3.16b            // halving sum of V
+.endif
+.endif
+
+.if \src_fmt == uyvy
+        st1             {v2.16b}, [x2], #16
+        st1             {v0.16b}, [x1], #16
+.else
+        st1             {v3.16b}, [x2], #16
+        st1             {v1.16b}, [x1], #16
+.endif
+
+.if \dst_fmt == yuv420                                    // store_y
+.if \src_fmt == uyvy
+        mov             v6.16b, v5.16b
+        st2             {v6.16b,v7.16b}, [x10], #32
+.else
+        mov             v5.16b, v4.16b
+        st2             {v5.16b,v6.16b}, [x10], #32
+.endif
+.endif
+
+.endif // ! \skip_storing_chroma
+
+.if \src_fmt == uyvy
+        mov             v2.16b, v1.16b
+        st2             {v2.16b,v3.16b}, [x0], #32
+.else
+        mov             v1.16b, v0.16b
+        st2             {v1.16b,v2.16b}, [x0], #32
+.endif
+.endm
+
+// shift pointers back to width - 32 to process the tail of the line
+// if the height is odd, processing the final line is simplified
+.macro fastpath_shift_back_pointers src_fmt, dst_fmt, is_final_odd_line
+        add             x3, x3, w9, sxtw #1
+        sub             x3, x3, #64
+.if ! \is_final_odd_line
+.if \dst_fmt == yuv420
+        add             x13, x13, w9, sxtw #1
+        sub             x13, x13, #64
+        add             x10, x10, w9, sxtw
+        sub             x10, x10, #32
+.endif
+.endif
+        add             x0, x0, w9, sxtw
+        sub             x0, x0, #32
+.if ! \is_final_odd_line
+        asr             w14, w9, #1
+        add             x1, x1, w14, sxtw
+        sub             x1, x1, #16
+        add             x2, x2, w14, sxtw
+        sub             x2, x2, #16
+.endif
+.endm
+
+.macro slowpath_iteration src_fmt, dst_fmt, skip_storing_chroma
+.if \dst_fmt == yuv422
+.if \src_fmt == uyvy
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x1], #1
+        strb            w14, [x0], #1
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x2], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x0], #1
+        strb            w14, [x1], #1
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x0], #1
+        strb            w14, [x2], #1
+.endif
+.endif
+.if \dst_fmt == yuv420
+.if \src_fmt == uyvy
+.if \skip_storing_chroma
+        ldrb            w12, [x3], #2
+        ldrb            w14, [x3], #2
+        strb            w12, [x0], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x13], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x1], #1
+        ldrb            w14, [x3], #1
+        ldrb            w12, [x13], #1
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+        ldrb            w14, [x13], #1
+        ldrb            w12, [x3], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x2], #1
+        ldrb            w14, [x3], #1
+        ldrb            w12, [x13], #1
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+.endif
+.else
+.if \skip_storing_chroma
+        ldrb            w12, [x3], #2
+        ldrb            w14, [x3], #2
+        strb            w12, [x0], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x13], #1
+        strb            w12, [x0], #1
+        strb            w14, [x10], #1
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x13], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x1], #1
+        ldrb            w14, [x3], #1
+        ldrb            w12, [x13], #1
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+        ldrb            w14, [x13], #1
+        ldrb            w12, [x3], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x2], #1
+.endif
+.endif
+.endif
+.endm
+
+.macro move_pointers_to_next_line src_fmt, dst_fmt, is_final_odd_line
+        add             x3, x3, x8
+        add             x0, x0, x6
+.if \dst_fmt == yuv420
+        add             x13, x13, x8
+        add             x10, x10, x6
+.endif
+        add             x1, x1, x7
+        add             x2, x2, x7
+.endm
+
+.macro interleaved_yuv_to_planar src_fmt, dst_fmt
+function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
+        sxtw            x6, w6
+        sxtw            x7, w7
+        ldrsw           x8, [sp]
+        ands            w11, w4, #~31                     // choose between fast and slow path
+
+.if \dst_fmt == yuv420
+        add             x10, x0, x6
+        add             x13, x3, x8
+        add             x8, x8, x8
+        add             x6, x6, x6
+        and             w17, w5, #1
+        asr             w5, w5, #1
+.endif
+        asr             w9, w4, #1
+        sub             x8, x8, w4, sxtw #1               // src offset
+        sub             x6, x6, w4, sxtw                  // lum offset
+        sub             x7, x7, x9                        // chr offset
+
+        b.eq            6f
+
+1:                                                        // fast path - the width is at least 32
+        and             w14, w4, #~31                     // w14 is the main loop counter
+        and             w9, w4, #31                       // w9 holds the remaining width, 0 to 31
+2:
+        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+        b.ne            2b
+        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0
+        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+        subs            w5, w5, #1
+        move_pointers_to_next_line \src_fmt, \dst_fmt
+        b.ne            1b
+
+.if \dst_fmt == yuv420                                    // handle the last line in case the height is odd
+        cbz             w17, 3f
+        and             w14, w4, #~31
+4:
+        fastpath_iteration \src_fmt, \dst_fmt, 0, 1
+        b.ne            4b
+        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1
+        fastpath_iteration \src_fmt, \dst_fmt, 1, 1
+3:
+.endif
+	ret
+
+6:                                                        // slow path - width is at most 31
+        and             w9, w4, #31
+7:
+        subs            w9, w9, #2
+        slowpath_iteration \src_fmt, \dst_fmt, 0
+        b.ne            7b
+        subs            w5, w5, #1
+        move_pointers_to_next_line \src_fmt, \dst_fmt
+        b.ne            6b
+
+.if \dst_fmt == yuv420
+        cbz             w17, 8f
+        and             w9, w4, #31
+.if \src_fmt == uyvy
+        add             x3, x3, #1
+.endif
+5:
+        subs            w9, w9, #2
+        slowpath_iteration \src_fmt, \dst_fmt, 1
+        b.ne            5b
+8:
+.endif
+        ret
+endfunc
+.endm
+
+interleaved_yuv_to_planar uyvy, yuv422
+interleaved_yuv_to_planar uyvy, yuv420
+interleaved_yuv_to_planar yuyv, yuv422
+interleaved_yuv_to_planar yuyv, yuv420
diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index b98c7c6b47..183b4eeaa8 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -61,7 +61,7 @@ static void check_shuffle_bytes(void * func, const char * report)
     memcpy(src1, src0, MAX_STRIDE);
 
     if (check_func(func, "%s", report)) {
-        for (i = 0; i < 6; i ++) {
+        for (i = 0; i < FF_ARRAY_ELEMS(width); i ++) {
             call_ref(src0, dst0, width[i]);
             call_new(src1, dst1, width[i]);
             if (memcmp(dst0, dst1, MAX_STRIDE))
@@ -71,9 +71,24 @@ static void check_shuffle_bytes(void * func, const char * report)
     }
 }
 
-static void check_uyvy_to_422p(void)
+typedef void (*uyvy_to_yuv_func)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                           const uint8_t *src, int width, int height,
+                           int lumStride, int chromStride, int srcStride);
+
+typedef struct
+{
+    uyvy_to_yuv_func func;
+    const char* from;
+    int to;
+} uyvy_to_yuv_f;
+
+static void check_uyvy_to_yuv(void)
 {
     int i;
+    uyvy_to_yuv_f funcs[] = {
+	{uyvytoyuv420, "uyvy", 420}, {uyvytoyuv422, "uyvy", 422},
+	{yuyvtoyuv420, "yuyv", 420}, {yuyvtoyuv422, "yuyv", 422}
+    };
 
     LOCAL_ALIGNED_32(uint8_t, src0, [MAX_STRIDE * MAX_HEIGHT * 2]);
     LOCAL_ALIGNED_32(uint8_t, src1, [MAX_STRIDE * MAX_HEIGHT * 2]);
@@ -91,26 +106,28 @@ static void check_uyvy_to_422p(void)
     randomize_buffers(src0, MAX_STRIDE * MAX_HEIGHT * 2);
     memcpy(src1, src0, MAX_STRIDE * MAX_HEIGHT * 2);
 
-    if (check_func(uyvytoyuv422, "uyvytoyuv422")) {
-        for (i = 0; i < 6; i ++) {
-            memset(dst_y_0, 0, MAX_STRIDE * MAX_HEIGHT);
-            memset(dst_y_1, 0, MAX_STRIDE * MAX_HEIGHT);
-            memset(dst_u_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-            memset(dst_u_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-            memset(dst_v_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-            memset(dst_v_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-
-            call_ref(dst_y_0, dst_u_0, dst_v_0, src0, planes[i].w, planes[i].h,
-                     MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
-            call_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[i].w, planes[i].h,
-                     MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
-            if (memcmp(dst_y_0, dst_y_1, MAX_STRIDE * MAX_HEIGHT) ||
-                memcmp(dst_u_0, dst_u_1, (MAX_STRIDE/2) * MAX_HEIGHT) ||
-                memcmp(dst_v_0, dst_v_1, (MAX_STRIDE/2) * MAX_HEIGHT))
-                fail();
+    for (int k = 0; k < FF_ARRAY_ELEMS(funcs); k ++) {
+	if (check_func(funcs[k].func, "%stoyuv%d",funcs[k].from, funcs[k].to)) {
+	    for (i = 0; i < FF_ARRAY_ELEMS(planes); i ++) {
+		memset(dst_y_0, 0, MAX_STRIDE * MAX_HEIGHT);
+		memset(dst_y_1, 0, MAX_STRIDE * MAX_HEIGHT);
+		memset(dst_u_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+		memset(dst_u_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+		memset(dst_v_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+		memset(dst_v_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+
+		call_ref(dst_y_0, dst_u_0, dst_v_0, src0, planes[i].w, planes[i].h,
+			 MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+		call_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[i].w, planes[i].h,
+			 MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+		if (memcmp(dst_y_0, dst_y_1, MAX_STRIDE * MAX_HEIGHT) ||
+		    memcmp(dst_u_0, dst_u_1, (MAX_STRIDE/2) * MAX_HEIGHT) ||
+		    memcmp(dst_v_0, dst_v_1, (MAX_STRIDE/2) * MAX_HEIGHT))
+		    fail();
+	    }
+	    bench_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[5].w, planes[5].h,
+		      MAX_STRIDE, MAX_STRIDE / 2, planes[5].s);
         }
-        bench_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[5].w, planes[5].h,
-                  MAX_STRIDE, MAX_STRIDE / 2, planes[5].s);
     }
 }
 
@@ -834,8 +851,8 @@ void checkasm_check_sw_rgb(void)
     check_shuffle_bytes(shuffle_bytes_2130, "shuffle_bytes_2130");
     report("shuffle_bytes_2130");
 
-    check_uyvy_to_422p();
-    report("uyvytoyuv422");
+    check_uyvy_to_yuv();
+    report("uyvytoyuv");
 
     check_interleave_bytes();
     report("interleave_bytes");
-- 
2.47.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422
  2025-02-10 13:15 ` Martin Storsjö
  2025-02-11 21:24   ` Krzysztof Pyrkosz via ffmpeg-devel
@ 2025-02-11 21:33   ` Krzysztof Pyrkosz via ffmpeg-devel
  2025-02-11 21:53     ` Martin Storsjö
  1 sibling, 1 reply; 9+ messages in thread
From: Krzysztof Pyrkosz via ffmpeg-devel @ 2025-02-11 21:33 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Krzysztof Pyrkosz

I forgot to include the benchmarks in the previous message; here they
are:

A78:
uyvytoyuv420_neon:                                    6112.5 ( 6.96x)
uyvytoyuv422_neon:                                    6696.0 ( 6.32x)
yuyvtoyuv420_neon:                                    6113.0 ( 6.95x)
yuyvtoyuv422_neon:                                    6695.2 ( 6.31x)

A72:
uyvytoyuv420_neon:                                    9512.1 ( 6.09x)
uyvytoyuv422_neon:                                    9766.8 ( 6.32x)
yuyvtoyuv420_neon:                                    9639.1 ( 6.00x)
yuyvtoyuv422_neon:                                    9779.0 ( 6.03x)

A53:
uyvytoyuv420_neon:                                   12720.1 ( 9.10x)
uyvytoyuv422_neon:                                   14282.9 ( 6.71x)
yuyvtoyuv420_neon:                                   12637.4 ( 9.15x)
yuyvtoyuv422_neon:                                   14127.6 ( 6.77x)

---
 libswscale/aarch64/rgb2rgb.c      |  16 ++
 libswscale/aarch64/rgb2rgb_neon.S | 262 ++++++++++++++++++++++++++++++
 tests/checkasm/sw_rgb.c           |  63 ++++---
 3 files changed, 318 insertions(+), 23 deletions(-)

diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index 7e1dba572d..f474228298 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -67,6 +67,18 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
 
+void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
 av_cold void rgb2rgb_init_aarch64(void)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -84,5 +96,9 @@ av_cold void rgb2rgb_init_aarch64(void)
         shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
         shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
         shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
+        uyvytoyuv422       = ff_uyvytoyuv422_neon;
+        uyvytoyuv420       = ff_uyvytoyuv420_neon;
+        yuyvtoyuv422       = ff_yuyvtoyuv422_neon;
+        yuyvtoyuv420       = ff_yuyvtoyuv420_neon;
     }
 }
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 22ecdf7ac8..9002aa028f 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -427,3 +427,265 @@ neon_shuf 2013
 neon_shuf 1203
 neon_shuf 2130
 neon_shuf 3210
+
+/*
+v0-v7 - two consecutive lines
+x0 - upper Y destination
+x1 - U destination
+x2 - V destination
+x3 - upper src line
+w5 - height/iteration counter - count of line pairs for yuv420, of single lines for 422
+x6 - lum padding
+x7 - chrom padding
+x8 - src padding
+w9 - number of bytes remaining in the tail
+x10 - lower Y destination
+w12 - tmp
+x13 - lower src line
+w14 - tmp
+w17 - set to 1 if last line has to be handled separately (odd height)
+*/
+
+// one fast path iteration processes 16 uyvy tuples
+// is_line_tail is set to 1 when final 16 tuples are being processed
+// skip_storing_chroma is set to 1 when final line is processed and the height is odd
+.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma
+        ld4             {v0.16b - v3.16b}, [x3], #64
+.if ! \is_line_tail
+        subs            w14, w14, #32
+.endif
+
+.if ! \skip_storing_chroma
+.if \dst_fmt == yuv420
+        ld4             {v4.16b - v7.16b}, [x13], #64
+.endif
+
+.if \dst_fmt == yuv420                                    // store UV
+.if \src_fmt == uyvy
+        uhadd           v0.16b, v4.16b, v0.16b            // halving sum of U
+        uhadd           v2.16b, v6.16b, v2.16b            // halving sum of V
+.else
+        uhadd           v1.16b, v5.16b, v1.16b            // halving sum of U
+        uhadd           v3.16b, v7.16b, v3.16b            // halving sum of V
+.endif
+.endif
+
+.if \src_fmt == uyvy
+        st1             {v2.16b}, [x2], #16
+        st1             {v0.16b}, [x1], #16
+.else
+        st1             {v3.16b}, [x2], #16
+        st1             {v1.16b}, [x1], #16
+.endif
+
+.if \dst_fmt == yuv420                                    // store_y
+.if \src_fmt == uyvy
+        mov             v6.16b, v5.16b
+        st2             {v6.16b,v7.16b}, [x10], #32
+.else
+        mov             v5.16b, v4.16b
+        st2             {v5.16b,v6.16b}, [x10], #32
+.endif
+.endif
+
+.endif // ! \skip_storing_chroma
+
+.if \src_fmt == uyvy
+        mov             v2.16b, v1.16b
+        st2             {v2.16b,v3.16b}, [x0], #32
+.else
+        mov             v1.16b, v0.16b
+        st2             {v1.16b,v2.16b}, [x0], #32
+.endif
+.endm
+
+// shift pointers back to width - 32 to process the tail of the line
+// if the height is odd, processing the final line is simplified
+.macro fastpath_shift_back_pointers src_fmt, dst_fmt, is_final_odd_line
+        add             x3, x3, w9, sxtw #1
+        sub             x3, x3, #64
+.if ! \is_final_odd_line
+.if \dst_fmt == yuv420
+        add             x13, x13, w9, sxtw #1
+        sub             x13, x13, #64
+        add             x10, x10, w9, sxtw
+        sub             x10, x10, #32
+.endif
+.endif
+        add             x0, x0, w9, sxtw
+        sub             x0, x0, #32
+.if ! \is_final_odd_line
+        asr             w14, w9, #1
+        add             x1, x1, w14, sxtw
+        sub             x1, x1, #16
+        add             x2, x2, w14, sxtw
+        sub             x2, x2, #16
+.endif
+.endm
+
+.macro slowpath_iteration src_fmt, dst_fmt, skip_storing_chroma
+.if \dst_fmt == yuv422
+.if \src_fmt == uyvy
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x1], #1
+        strb            w14, [x0], #1
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x2], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x0], #1
+        strb            w14, [x1], #1
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x0], #1
+        strb            w14, [x2], #1
+.endif
+.endif
+.if \dst_fmt == yuv420
+.if \src_fmt == uyvy
+.if \skip_storing_chroma
+        ldrb            w12, [x3], #2
+        ldrb            w14, [x3], #2
+        strb            w12, [x0], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x13], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x1], #1
+        ldrb            w14, [x3], #1
+        ldrb            w12, [x13], #1
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+        ldrb            w14, [x13], #1
+        ldrb            w12, [x3], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x2], #1
+        ldrb            w14, [x3], #1
+        ldrb            w12, [x13], #1
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+.endif
+.else
+.if \skip_storing_chroma
+        ldrb            w12, [x3], #2
+        ldrb            w14, [x3], #2
+        strb            w12, [x0], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x13], #1
+        strb            w12, [x0], #1
+        strb            w14, [x10], #1
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x13], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x1], #1
+        ldrb            w14, [x3], #1
+        ldrb            w12, [x13], #1
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+        ldrb            w14, [x13], #1
+        ldrb            w12, [x3], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x2], #1
+.endif
+.endif
+.endif
+.endm
+
+.macro move_pointers_to_next_line src_fmt, dst_fmt, is_final_odd_line
+        add             x3, x3, x8
+        add             x0, x0, x6
+.if \dst_fmt == yuv420
+        add             x13, x13, x8
+        add             x10, x10, x6
+.endif
+        add             x1, x1, x7
+        add             x2, x2, x7
+.endm
+
+.macro interleaved_yuv_to_planar src_fmt, dst_fmt
+function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
+        sxtw            x6, w6
+        sxtw            x7, w7
+        ldrsw           x8, [sp]
+        ands            w11, w4, #~31                     // choose between fast and slow path
+
+.if \dst_fmt == yuv420
+        add             x10, x0, x6
+        add             x13, x3, x8
+        add             x8, x8, x8
+        add             x6, x6, x6
+        and             w17, w5, #1
+        asr             w5, w5, #1
+.endif
+        asr             w9, w4, #1
+        sub             x8, x8, w4, sxtw #1               // src offset
+        sub             x6, x6, w4, sxtw                  // lum offset
+        sub             x7, x7, x9                        // chr offset
+
+        b.eq            6f
+
+1:                                                        // fast path - the width is at least 32
+        and             w14, w4, #~31                     // w14 is the main loop counter
+        and             w9, w4, #31                       // w9 holds the remaining width, 0 to 31
+2:
+        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+        b.ne            2b
+        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0
+        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+        subs            w5, w5, #1
+        move_pointers_to_next_line \src_fmt, \dst_fmt
+        b.ne            1b
+
+.if \dst_fmt == yuv420                                    // handle the last line in case the height is odd
+        cbz             w17, 3f
+        and             w14, w4, #~31
+4:
+        fastpath_iteration \src_fmt, \dst_fmt, 0, 1
+        b.ne            4b
+        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1
+        fastpath_iteration \src_fmt, \dst_fmt, 1, 1
+3:
+.endif
+	ret
+
+6:                                                        // slow path - width is at most 31
+        and             w9, w4, #31
+7:
+        subs            w9, w9, #2
+        slowpath_iteration \src_fmt, \dst_fmt, 0
+        b.ne            7b
+        subs            w5, w5, #1
+        move_pointers_to_next_line \src_fmt, \dst_fmt
+        b.ne            6b
+
+.if \dst_fmt == yuv420
+        cbz             w17, 8f
+        and             w9, w4, #31
+.if \src_fmt == uyvy
+        add             x3, x3, #1
+.endif
+5:
+        subs            w9, w9, #2
+        slowpath_iteration \src_fmt, \dst_fmt, 1
+        b.ne            5b
+8:
+.endif
+        ret
+endfunc
+.endm
+
+interleaved_yuv_to_planar uyvy, yuv422
+interleaved_yuv_to_planar uyvy, yuv420
+interleaved_yuv_to_planar yuyv, yuv422
+interleaved_yuv_to_planar yuyv, yuv420
diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index b98c7c6b47..183b4eeaa8 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -61,7 +61,7 @@ static void check_shuffle_bytes(void * func, const char * report)
     memcpy(src1, src0, MAX_STRIDE);
 
     if (check_func(func, "%s", report)) {
-        for (i = 0; i < 6; i ++) {
+        for (i = 0; i < FF_ARRAY_ELEMS(width); i ++) {
             call_ref(src0, dst0, width[i]);
             call_new(src1, dst1, width[i]);
             if (memcmp(dst0, dst1, MAX_STRIDE))
@@ -71,9 +71,24 @@ static void check_shuffle_bytes(void * func, const char * report)
     }
 }
 
-static void check_uyvy_to_422p(void)
+typedef void (*uyvy_to_yuv_func)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                           const uint8_t *src, int width, int height,
+                           int lumStride, int chromStride, int srcStride);
+
+typedef struct
+{
+    uyvy_to_yuv_func func;
+    const char* from;
+    int to;
+} uyvy_to_yuv_f;
+
+static void check_uyvy_to_yuv(void)
 {
     int i;
+    uyvy_to_yuv_f funcs[] = {
+	{uyvytoyuv420, "uyvy", 420}, {uyvytoyuv422, "uyvy", 422},
+	{yuyvtoyuv420, "yuyv", 420}, {yuyvtoyuv422, "yuyv", 422}
+    };
 
     LOCAL_ALIGNED_32(uint8_t, src0, [MAX_STRIDE * MAX_HEIGHT * 2]);
     LOCAL_ALIGNED_32(uint8_t, src1, [MAX_STRIDE * MAX_HEIGHT * 2]);
@@ -91,26 +106,28 @@ static void check_uyvy_to_422p(void)
     randomize_buffers(src0, MAX_STRIDE * MAX_HEIGHT * 2);
     memcpy(src1, src0, MAX_STRIDE * MAX_HEIGHT * 2);
 
-    if (check_func(uyvytoyuv422, "uyvytoyuv422")) {
-        for (i = 0; i < 6; i ++) {
-            memset(dst_y_0, 0, MAX_STRIDE * MAX_HEIGHT);
-            memset(dst_y_1, 0, MAX_STRIDE * MAX_HEIGHT);
-            memset(dst_u_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-            memset(dst_u_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-            memset(dst_v_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-            memset(dst_v_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-
-            call_ref(dst_y_0, dst_u_0, dst_v_0, src0, planes[i].w, planes[i].h,
-                     MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
-            call_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[i].w, planes[i].h,
-                     MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
-            if (memcmp(dst_y_0, dst_y_1, MAX_STRIDE * MAX_HEIGHT) ||
-                memcmp(dst_u_0, dst_u_1, (MAX_STRIDE/2) * MAX_HEIGHT) ||
-                memcmp(dst_v_0, dst_v_1, (MAX_STRIDE/2) * MAX_HEIGHT))
-                fail();
+    for (int k = 0; k < FF_ARRAY_ELEMS(funcs); k ++) {
+	if (check_func(funcs[k].func, "%stoyuv%d",funcs[k].from, funcs[k].to)) {
+	    for (i = 0; i < FF_ARRAY_ELEMS(planes); i ++) {
+		memset(dst_y_0, 0, MAX_STRIDE * MAX_HEIGHT);
+		memset(dst_y_1, 0, MAX_STRIDE * MAX_HEIGHT);
+		memset(dst_u_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+		memset(dst_u_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+		memset(dst_v_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+		memset(dst_v_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+
+		call_ref(dst_y_0, dst_u_0, dst_v_0, src0, planes[i].w, planes[i].h,
+			 MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+		call_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[i].w, planes[i].h,
+			 MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+		if (memcmp(dst_y_0, dst_y_1, MAX_STRIDE * MAX_HEIGHT) ||
+		    memcmp(dst_u_0, dst_u_1, (MAX_STRIDE/2) * MAX_HEIGHT) ||
+		    memcmp(dst_v_0, dst_v_1, (MAX_STRIDE/2) * MAX_HEIGHT))
+		    fail();
+	    }
+	    bench_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[5].w, planes[5].h,
+		      MAX_STRIDE, MAX_STRIDE / 2, planes[5].s);
         }
-        bench_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[5].w, planes[5].h,
-                  MAX_STRIDE, MAX_STRIDE / 2, planes[5].s);
     }
 }
 
@@ -834,8 +851,8 @@ void checkasm_check_sw_rgb(void)
     check_shuffle_bytes(shuffle_bytes_2130, "shuffle_bytes_2130");
     report("shuffle_bytes_2130");
 
-    check_uyvy_to_422p();
-    report("uyvytoyuv422");
+    check_uyvy_to_yuv();
+    report("uyvytoyuv");
 
     check_interleave_bytes();
     report("interleave_bytes");
-- 
2.47.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422
  2025-02-11 21:33   ` Krzysztof Pyrkosz via ffmpeg-devel
@ 2025-02-11 21:53     ` Martin Storsjö
  2025-02-11 22:06       ` [FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_rgb: Added {yuyv, uyvy}toyuv{420, 422} test cases Krzysztof Pyrkosz via ffmpeg-devel
  2025-02-11 22:06       ` [FFmpeg-devel] [PATCH 2/2] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422} Krzysztof Pyrkosz via ffmpeg-devel
  0 siblings, 2 replies; 9+ messages in thread
From: Martin Storsjö @ 2025-02-11 21:53 UTC (permalink / raw)
  To: Krzysztof Pyrkosz via ffmpeg-devel; +Cc: Krzysztof Pyrkosz

On Tue, 11 Feb 2025, Krzysztof Pyrkosz via ffmpeg-devel wrote:

> I forgot to include the benchmarks in the previous message, here they
> are:
>
> A78:
> uyvytoyuv420_neon:                                    6112.5 ( 6.96x)
> uyvytoyuv422_neon:                                    6696.0 ( 6.32x)
> yuyvtoyuv420_neon:                                    6113.0 ( 6.95x)
> yuyvtoyuv422_neon:                                    6695.2 ( 6.31x)
>
> A72:
> uyvytoyuv420_neon:                                    9512.1 ( 6.09x)
> uyvytoyuv422_neon:                                    9766.8 ( 6.32x)
> yuyvtoyuv420_neon:                                    9639.1 ( 6.00x)
> yuyvtoyuv422_neon:                                    9779.0 ( 6.03x)
>
> A53:
> uyvytoyuv420_neon:                                   12720.1 ( 9.10x)
> uyvytoyuv422_neon:                                   14282.9 ( 6.71x)
> yuyvtoyuv420_neon:                                   12637.4 ( 9.15x)
> yuyvtoyuv422_neon:                                   14127.6 ( 6.77x)
>
> ---
> libswscale/aarch64/rgb2rgb.c      |  16 ++
> libswscale/aarch64/rgb2rgb_neon.S | 262 ++++++++++++++++++++++++++++++
> tests/checkasm/sw_rgb.c           |  63 ++++---
> 3 files changed, 318 insertions(+), 23 deletions(-)

Can you split out the checkasm changes to a separate preceding patch? That 
makes it easier to reason about it, regarding whether the new test 
coverage works for archs with existing asm, etc.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_rgb: Added {yuyv, uyvy}toyuv{420, 422} test cases
  2025-02-11 21:53     ` Martin Storsjö
@ 2025-02-11 22:06       ` Krzysztof Pyrkosz via ffmpeg-devel
  2025-02-12  9:37         ` Martin Storsjö
  2025-02-11 22:06       ` [FFmpeg-devel] [PATCH 2/2] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422} Krzysztof Pyrkosz via ffmpeg-devel
  1 sibling, 1 reply; 9+ messages in thread
From: Krzysztof Pyrkosz via ffmpeg-devel @ 2025-02-11 22:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Krzysztof Pyrkosz

Splitting the previous patch into two.
I noticed that on my x86 box, one of the newly added tests fails:

MMXEXT:
   uyvytoyuv420_mmxext (sw_rgb.c:126)
   yuyvtoyuv420_mmxext (sw_rgb.c:126)
 - sw_rgb.uyvytoyuv          [FAILED]

SSE2, AVX and AVX2 are passing, though.

---
 tests/checkasm/sw_rgb.c | 63 ++++++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 23 deletions(-)

diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index b98c7c6b47..183b4eeaa8 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -61,7 +61,7 @@ static void check_shuffle_bytes(void * func, const char * report)
     memcpy(src1, src0, MAX_STRIDE);
 
     if (check_func(func, "%s", report)) {
-        for (i = 0; i < 6; i ++) {
+        for (i = 0; i < FF_ARRAY_ELEMS(width); i ++) {
             call_ref(src0, dst0, width[i]);
             call_new(src1, dst1, width[i]);
             if (memcmp(dst0, dst1, MAX_STRIDE))
@@ -71,9 +71,24 @@ static void check_shuffle_bytes(void * func, const char * report)
     }
 }
 
-static void check_uyvy_to_422p(void)
+typedef void (*uyvy_to_yuv_func)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                           const uint8_t *src, int width, int height,
+                           int lumStride, int chromStride, int srcStride);
+
+typedef struct
+{
+    uyvy_to_yuv_func func;
+    const char* from;
+    int to;
+} uyvy_to_yuv_f;
+
+static void check_uyvy_to_yuv(void)
 {
     int i;
+    uyvy_to_yuv_f funcs[] = {
+	{uyvytoyuv420, "uyvy", 420}, {uyvytoyuv422, "uyvy", 422},
+	{yuyvtoyuv420, "yuyv", 420}, {yuyvtoyuv422, "yuyv", 422}
+    };
 
     LOCAL_ALIGNED_32(uint8_t, src0, [MAX_STRIDE * MAX_HEIGHT * 2]);
     LOCAL_ALIGNED_32(uint8_t, src1, [MAX_STRIDE * MAX_HEIGHT * 2]);
@@ -91,26 +106,28 @@ static void check_uyvy_to_422p(void)
     randomize_buffers(src0, MAX_STRIDE * MAX_HEIGHT * 2);
     memcpy(src1, src0, MAX_STRIDE * MAX_HEIGHT * 2);
 
-    if (check_func(uyvytoyuv422, "uyvytoyuv422")) {
-        for (i = 0; i < 6; i ++) {
-            memset(dst_y_0, 0, MAX_STRIDE * MAX_HEIGHT);
-            memset(dst_y_1, 0, MAX_STRIDE * MAX_HEIGHT);
-            memset(dst_u_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-            memset(dst_u_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-            memset(dst_v_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-            memset(dst_v_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-
-            call_ref(dst_y_0, dst_u_0, dst_v_0, src0, planes[i].w, planes[i].h,
-                     MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
-            call_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[i].w, planes[i].h,
-                     MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
-            if (memcmp(dst_y_0, dst_y_1, MAX_STRIDE * MAX_HEIGHT) ||
-                memcmp(dst_u_0, dst_u_1, (MAX_STRIDE/2) * MAX_HEIGHT) ||
-                memcmp(dst_v_0, dst_v_1, (MAX_STRIDE/2) * MAX_HEIGHT))
-                fail();
+    for (int k = 0; k < FF_ARRAY_ELEMS(funcs); k ++) {
+	if (check_func(funcs[k].func, "%stoyuv%d",funcs[k].from, funcs[k].to)) {
+	    for (i = 0; i < FF_ARRAY_ELEMS(planes); i ++) {
+		memset(dst_y_0, 0, MAX_STRIDE * MAX_HEIGHT);
+		memset(dst_y_1, 0, MAX_STRIDE * MAX_HEIGHT);
+		memset(dst_u_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+		memset(dst_u_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+		memset(dst_v_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+		memset(dst_v_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+
+		call_ref(dst_y_0, dst_u_0, dst_v_0, src0, planes[i].w, planes[i].h,
+			 MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+		call_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[i].w, planes[i].h,
+			 MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+		if (memcmp(dst_y_0, dst_y_1, MAX_STRIDE * MAX_HEIGHT) ||
+		    memcmp(dst_u_0, dst_u_1, (MAX_STRIDE/2) * MAX_HEIGHT) ||
+		    memcmp(dst_v_0, dst_v_1, (MAX_STRIDE/2) * MAX_HEIGHT))
+		    fail();
+	    }
+	    bench_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[5].w, planes[5].h,
+		      MAX_STRIDE, MAX_STRIDE / 2, planes[5].s);
         }
-        bench_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[5].w, planes[5].h,
-                  MAX_STRIDE, MAX_STRIDE / 2, planes[5].s);
     }
 }
 
@@ -834,8 +851,8 @@ void checkasm_check_sw_rgb(void)
     check_shuffle_bytes(shuffle_bytes_2130, "shuffle_bytes_2130");
     report("shuffle_bytes_2130");
 
-    check_uyvy_to_422p();
-    report("uyvytoyuv422");
+    check_uyvy_to_yuv();
+    report("uyvytoyuv");
 
     check_interleave_bytes();
     report("interleave_bytes");
-- 
2.47.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [FFmpeg-devel] [PATCH 2/2] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422}
  2025-02-11 21:53     ` Martin Storsjö
  2025-02-11 22:06       ` [FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_rgb: Added {yuyv, uyvy}toyuv{420, 422} test cases Krzysztof Pyrkosz via ffmpeg-devel
@ 2025-02-11 22:06       ` Krzysztof Pyrkosz via ffmpeg-devel
  2025-02-12 12:02         ` Martin Storsjö
  1 sibling, 1 reply; 9+ messages in thread
From: Krzysztof Pyrkosz via ffmpeg-devel @ 2025-02-11 22:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Krzysztof Pyrkosz

---
 libswscale/aarch64/rgb2rgb.c      |  16 ++
 libswscale/aarch64/rgb2rgb_neon.S | 262 ++++++++++++++++++++++++++++++
 2 files changed, 278 insertions(+)

diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index 7e1dba572d..f474228298 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -67,6 +67,18 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
 
+void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
 av_cold void rgb2rgb_init_aarch64(void)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -84,5 +96,9 @@ av_cold void rgb2rgb_init_aarch64(void)
         shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
         shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
         shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
+        uyvytoyuv422       = ff_uyvytoyuv422_neon;
+        uyvytoyuv420       = ff_uyvytoyuv420_neon;
+        yuyvtoyuv422       = ff_yuyvtoyuv422_neon;
+        yuyvtoyuv420       = ff_yuyvtoyuv420_neon;
     }
 }
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 22ecdf7ac8..9002aa028f 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -427,3 +427,265 @@ neon_shuf 2013
 neon_shuf 1203
 neon_shuf 2130
 neon_shuf 3210
+
+/*
+v0-v7 - two consecutive lines
+x0 - upper Y destination
+x1 - U destination
+x2 - V destination
+x3 - upper src line
+w5 - height/iteration counter - count of line pairs for yuv420, of single lines for 422
+x6 - lum padding
+x7 - chrom padding
+x8 - src padding
+w9 - number of bytes remaining in the tail
+x10 - lower Y destination
+w12 - tmp
+x13 - lower src line
+w14 - tmp
+w17 - set to 1 if last line has to be handled separately (odd height)
+*/
+
+// one fast path iteration processes 16 uyvy tuples
+// is_line_tail is set to 1 when final 16 tuples are being processed
+// skip_storing_chroma is set to 1 when final line is processed and the height is odd
+.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma
+        ld4             {v0.16b - v3.16b}, [x3], #64
+.if ! \is_line_tail
+        subs            w14, w14, #32
+.endif
+
+.if ! \skip_storing_chroma
+.if \dst_fmt == yuv420
+        ld4             {v4.16b - v7.16b}, [x13], #64
+.endif
+
+.if \dst_fmt == yuv420                                    // store UV
+.if \src_fmt == uyvy
+        uhadd           v0.16b, v4.16b, v0.16b            // halving sum of U
+        uhadd           v2.16b, v6.16b, v2.16b            // halving sum of V
+.else
+        uhadd           v1.16b, v5.16b, v1.16b            // halving sum of U
+        uhadd           v3.16b, v7.16b, v3.16b            // halving sum of V
+.endif
+.endif
+
+.if \src_fmt == uyvy
+        st1             {v2.16b}, [x2], #16
+        st1             {v0.16b}, [x1], #16
+.else
+        st1             {v3.16b}, [x2], #16
+        st1             {v1.16b}, [x1], #16
+.endif
+
+.if \dst_fmt == yuv420                                    // store_y
+.if \src_fmt == uyvy
+        mov             v6.16b, v5.16b
+        st2             {v6.16b,v7.16b}, [x10], #32
+.else
+        mov             v5.16b, v4.16b
+        st2             {v5.16b,v6.16b}, [x10], #32
+.endif
+.endif
+
+.endif // ! \skip_storing_chroma
+
+.if \src_fmt == uyvy
+        mov             v2.16b, v1.16b
+        st2             {v2.16b,v3.16b}, [x0], #32
+.else
+        mov             v1.16b, v0.16b
+        st2             {v1.16b,v2.16b}, [x0], #32
+.endif
+.endm
+
+// shift pointers back to width - 32 to process the tail of the line
+// if the height is odd, processing the final line is simplified
+.macro fastpath_shift_back_pointers src_fmt, dst_fmt, is_final_odd_line
+        add             x3, x3, w9, sxtw #1
+        sub             x3, x3, #64
+.if ! \is_final_odd_line
+.if \dst_fmt == yuv420
+        add             x13, x13, w9, sxtw #1
+        sub             x13, x13, #64
+        add             x10, x10, w9, sxtw
+        sub             x10, x10, #32
+.endif
+.endif
+        add             x0, x0, w9, sxtw
+        sub             x0, x0, #32
+.if ! \is_final_odd_line
+        asr             w14, w9, #1
+        add             x1, x1, w14, sxtw
+        sub             x1, x1, #16
+        add             x2, x2, w14, sxtw
+        sub             x2, x2, #16
+.endif
+.endm
+
+.macro slowpath_iteration src_fmt, dst_fmt, skip_storing_chroma
+.if \dst_fmt == yuv422
+.if \src_fmt == uyvy
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x1], #1
+        strb            w14, [x0], #1
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x2], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x0], #1
+        strb            w14, [x1], #1
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x3], #1
+        strb            w12, [x0], #1
+        strb            w14, [x2], #1
+.endif
+.endif
+.if \dst_fmt == yuv420
+.if \src_fmt == uyvy
+.if \skip_storing_chroma
+        ldrb            w12, [x3], #2
+        ldrb            w14, [x3], #2
+        strb            w12, [x0], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x13], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x1], #1
+        ldrb            w14, [x3], #1
+        ldrb            w12, [x13], #1
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+        ldrb            w14, [x13], #1
+        ldrb            w12, [x3], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x2], #1
+        ldrb            w14, [x3], #1
+        ldrb            w12, [x13], #1
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+.endif
+.else
+.if \skip_storing_chroma
+        ldrb            w12, [x3], #2
+        ldrb            w14, [x3], #2
+        strb            w12, [x0], #1
+        strb            w14, [x0], #1
+.else
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x13], #1
+        strb            w12, [x0], #1
+        strb            w14, [x10], #1
+        ldrb            w12, [x3], #1
+        ldrb            w14, [x13], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x1], #1
+        ldrb            w14, [x3], #1
+        ldrb            w12, [x13], #1
+        strb            w14, [x0], #1
+        strb            w12, [x10], #1
+        ldrb            w14, [x13], #1
+        ldrb            w12, [x3], #1
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x2], #1
+.endif
+.endif
+.endif
+.endm
+
+.macro move_pointers_to_next_line src_fmt, dst_fmt, is_final_odd_line
+        add             x3, x3, x8
+        add             x0, x0, x6
+.if \dst_fmt == yuv420
+        add             x13, x13, x8
+        add             x10, x10, x6
+.endif
+        add             x1, x1, x7
+        add             x2, x2, x7
+.endm
+
+.macro interleaved_yuv_to_planar src_fmt, dst_fmt
+function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
+        sxtw            x6, w6
+        sxtw            x7, w7
+        ldrsw           x8, [sp]
+        ands            w11, w4, #~31                     // choose between fast and slow path
+
+.if \dst_fmt == yuv420
+        add             x10, x0, x6
+        add             x13, x3, x8
+        add             x8, x8, x8
+        add             x6, x6, x6
+        and             w17, w5, #1
+        asr             w5, w5, #1
+.endif
+        asr             w9, w4, #1
+        sub             x8, x8, w4, sxtw #1               // src offset
+        sub             x6, x6, w4, sxtw                  // lum offset
+        sub             x7, x7, x9                        // chr offset
+
+        b.eq            6f
+
+1:                                                        // fast path - the width is at least 32
+        and             w14, w4, #~31                     // w14 is the main loop counter
+        and             w9, w4, #31                       // w9 holds the remaining width, 0 to 31
+2:
+        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+        b.ne            2b
+        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0
+        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+        subs            w5, w5, #1
+        move_pointers_to_next_line \src_fmt, \dst_fmt
+        b.ne            1b
+
+.if \dst_fmt == yuv420                                    // handle the last line in case the height is odd
+        cbz             w17, 3f
+        and             w14, w4, #~31
+4:
+        fastpath_iteration \src_fmt, \dst_fmt, 0, 1
+        b.ne            4b
+        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1
+        fastpath_iteration \src_fmt, \dst_fmt, 1, 1
+3:
+.endif
+	ret
+
+6:                                                        // slow path - width is at most 31
+        and             w9, w4, #31
+7:
+        subs            w9, w9, #2
+        slowpath_iteration \src_fmt, \dst_fmt, 0
+        b.ne            7b
+        subs            w5, w5, #1
+        move_pointers_to_next_line \src_fmt, \dst_fmt
+        b.ne            6b
+
+.if \dst_fmt == yuv420
+        cbz             w17, 8f
+        and             w9, w4, #31
+.if \src_fmt == uyvy
+        add             x3, x3, #1
+.endif
+5:
+        subs            w9, w9, #2
+        slowpath_iteration \src_fmt, \dst_fmt, 1
+        b.ne            5b
+8:
+.endif
+        ret
+endfunc
+.endm
+
+interleaved_yuv_to_planar uyvy, yuv422
+interleaved_yuv_to_planar uyvy, yuv420
+interleaved_yuv_to_planar yuyv, yuv422
+interleaved_yuv_to_planar yuyv, yuv420
-- 
2.47.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_rgb: Added {yuyv, uyvy}toyuv{420, 422} test cases
  2025-02-11 22:06       ` [FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_rgb: Added {yuyv, uyvy}toyuv{420, 422} test cases Krzysztof Pyrkosz via ffmpeg-devel
@ 2025-02-12  9:37         ` Martin Storsjö
  0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2025-02-12  9:37 UTC (permalink / raw)
  To: Krzysztof Pyrkosz via ffmpeg-devel; +Cc: Krzysztof Pyrkosz

On Tue, 11 Feb 2025, Krzysztof Pyrkosz via ffmpeg-devel wrote:

> Splitting the previous patch into two.
> I noticed that on my x86 box, one of the newly added tests fails:
>
> MMXEXT:
>   uyvytoyuv420_mmxext (sw_rgb.c:126)
>   yuyvtoyuv420_mmxext (sw_rgb.c:126)
> - sw_rgb.uyvytoyuv          [FAILED]
>
> SSE2, AVX and AVX2 are passing, though.

Hmm, that's problematic - we need to sort that out before we can push this 
test.

It's possible that this is a bug in the mmxext implementation. It's also 
possible that it is a case where the test is overly strict about something 
where we legitimately are allowed to deviate a little. (E.g. for cases 
where we know we have aligned/padded line sizes, it may be acceptable to 
overwrite the payload area by a couple of bytes.)

For failures in mmx tests, it's also possible that the declare_func 
should be a declare_func_emms instead.

We'd need to check exactly what differs in the output.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422}
  2025-02-11 22:06       ` [FFmpeg-devel] [PATCH 2/2] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422} Krzysztof Pyrkosz via ffmpeg-devel
@ 2025-02-12 12:02         ` Martin Storsjö
  0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2025-02-12 12:02 UTC (permalink / raw)
  To: Krzysztof Pyrkosz via ffmpeg-devel; +Cc: Krzysztof Pyrkosz

On Tue, 11 Feb 2025, Krzysztof Pyrkosz via ffmpeg-devel wrote:

> ---
> libswscale/aarch64/rgb2rgb.c      |  16 ++
> libswscale/aarch64/rgb2rgb_neon.S | 262 ++++++++++++++++++++++++++++++
> 2 files changed, 278 insertions(+)
>
> diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
> index 7e1dba572d..f474228298 100644
> --- a/libswscale/aarch64/rgb2rgb.c
> +++ b/libswscale/aarch64/rgb2rgb.c
> @@ -67,6 +67,18 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
>
> +void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> +                          const uint8_t *src, int width, int height,
> +                          int lumStride, int chromStride, int srcStride);
> +void ff_uyvytoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> +                          const uint8_t *src, int width, int height,
> +                          int lumStride, int chromStride, int srcStride);
> +void ff_yuyvtoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> +                          const uint8_t *src, int width, int height,
> +                          int lumStride, int chromStride, int srcStride);
> +void ff_yuyvtoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> +                          const uint8_t *src, int width, int height,
> +                          int lumStride, int chromStride, int srcStride);
> av_cold void rgb2rgb_init_aarch64(void)
> {
>     int cpu_flags = av_get_cpu_flags();
> @@ -84,5 +96,9 @@ av_cold void rgb2rgb_init_aarch64(void)
>         shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
>         shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
>         shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
> +        uyvytoyuv422       = ff_uyvytoyuv422_neon;
> +        uyvytoyuv420       = ff_uyvytoyuv420_neon;
> +        yuyvtoyuv422       = ff_yuyvtoyuv422_neon;
> +        yuyvtoyuv420       = ff_yuyvtoyuv420_neon;
>     }
> }
> diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
> index 22ecdf7ac8..9002aa028f 100644
> --- a/libswscale/aarch64/rgb2rgb_neon.S
> +++ b/libswscale/aarch64/rgb2rgb_neon.S
> @@ -427,3 +427,265 @@ neon_shuf 2013
> neon_shuf 1203
> neon_shuf 2130
> neon_shuf 3210
> +
> +/*
> +v0-v7 - two consecutive lines
> +x0 - upper Y destination
> +x1 - U destination
> +x2 - V destination
> +x3 - upper src line
> +w5 - width/iteration counter - count of line pairs for yuv420, of single lines for 422
> +x6 - lum padding
> +x7 - chrom padding
> +x8 - src padding
> +w9 - number of bytes remaining in the tail
> +x10 - lower Y destination
> +w12 - tmp
> +x13 - lower src line
> +w14 - tmp
> +w17 - set to 1 if last line has to be handled separately (odd height)
> +*/
> +
> +// one fast path iteration processes 16 uyvy tuples
> +// is_line_tail is set to 1 when final 16 tuples are being processed
> +// skip_storing_chroma is set to 1 when final line is processed and the height is odd
> +.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma
> +        ld4             {v0.16b - v3.16b}, [x3], #64
> +.if ! \is_line_tail
> +        subs            w14, w14, #32
> +.endif
> +
> +.if ! \skip_storing_chroma
> +.if \dst_fmt == yuv420

This doesn't work as you want it to across all supported tools; .if 
conditionals are meant for pure numerical comparisons, and yuv420 isn't a 
numerical constant. In practice it does seem to work with binutils, but 
not with Clang (or with gas-preprocessor).

You can use .ifc for string comparisons, see 
https://sourceware.org/binutils/docs/as/If.html for more references.

Also see 
https://github.com/mstorsjo/FFmpeg/actions/runs/13282469154/job/37083639139 
for the fallout from trying to build this patch with various tool setups.

Please do consider trying the aarch64 assembly test set from 
https://github.com/mstorsjo/FFmpeg/commits/gha-aarch64 on your commits.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2025-02-12 12:02 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-02-07 19:06 [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422 Krzysztof Pyrkosz via ffmpeg-devel
2025-02-10 13:15 ` Martin Storsjö
2025-02-11 21:24   ` Krzysztof Pyrkosz via ffmpeg-devel
2025-02-11 21:33   ` Krzysztof Pyrkosz via ffmpeg-devel
2025-02-11 21:53     ` Martin Storsjö
2025-02-11 22:06       ` [FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_rgb: Added {yuyv, uyvy}toyuv{420, 422} test cases Krzysztof Pyrkosz via ffmpeg-devel
2025-02-12  9:37         ` Martin Storsjö
2025-02-11 22:06       ` [FFmpeg-devel] [PATCH 2/2] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422} Krzysztof Pyrkosz via ffmpeg-devel
2025-02-12 12:02         ` Martin Storsjö

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git