Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422
@ 2025-02-07 19:06 Krzysztof Pyrkosz via ffmpeg-devel
  2025-02-10 13:15 ` Martin Storsjö
  0 siblings, 1 reply; 2+ messages in thread
From: Krzysztof Pyrkosz via ffmpeg-devel @ 2025-02-07 19:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Krzysztof Pyrkosz

The patch contains NEON code that splits the uyvy input array into 3
separate buffers.

The existing test cases are covering scenarios with odd height and odd
stride, but width is even in every instance. Is it safe to make that
assumption about the width?

Just as I'm about to send this patch, I'm thinking if non-interleaved
read followed by 4 invocations of TBL wouldn't be more performant. One
call to generate a contiguous vector of u, second for v and two for y. 
I'm curious to find out.

The speed:

A78:
uyvytoyuv422_c:                                      42213.5 ( 1.00x)
uyvytoyuv422_neon:                                    5298.8 ( 7.97x)

A72:
uyvytoyuv422_c:                                      61797.6 ( 1.00x)
uyvytoyuv422_neon:                                   12141.9 ( 5.09x)

x13s:
uyvytoyuv422_c:                                      28581.7 ( 1.00x)
uyvytoyuv422_neon:                                    4882.4 ( 5.85x)

Krzysztof

---
 libswscale/aarch64/rgb2rgb.c      |  5 +++
 libswscale/aarch64/rgb2rgb_neon.S | 51 +++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index 7e1dba572d..096ed9f363 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -67,6 +67,10 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
 
+void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+
 av_cold void rgb2rgb_init_aarch64(void)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -84,5 +88,6 @@ av_cold void rgb2rgb_init_aarch64(void)
         shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
         shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
         shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
+        uyvytoyuv422       = ff_uyvytoyuv422_neon;
     }
 }
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 22ecdf7ac8..bdbee7df2e 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -427,3 +427,54 @@ neon_shuf 2013
 neon_shuf 1203
 neon_shuf 2130
 neon_shuf 3210
+
+function ff_uyvytoyuv422_neon, export=1
+        sxtw             x6, w6
+        sxtw             x7, w7
+        ldrsw            x8, [sp]
+        ubfx             x10, x4, #1, #31
+        sub              x8, x8, w4, sxtw #1               // src offset
+        sub              x6, x6, w4, sxtw                  // lum offset
+        sub              x7, x7, x10                       // chr offset
+1:
+        sub              w5, w5, #1
+        ands             w10, w4, #~31
+        and              w9, w4, #15
+        and              w11, w4, #16
+        b.eq             7f
+4:
+        ld4              {v0.16b - v3.16b}, [x3], #64      // handle 16 uyvy tuples per iteration
+        subs             w10, w10, #32
+        st1              {v2.16b}, [x2], #16
+        st1              {v0.16b}, [x1], #16
+        mov              v2.16b, v1.16b
+        st2              {v2.16b,v3.16b}, [x0], #32
+        b.ne             4b
+7:
+        cbz              w11, 5f                           // 8 - 15 remaining
+        ld4              {v0.8b - v3.8b}, [x3], #32
+        st1              {v2.8b}, [x2], #8
+        st1              {v0.8b}, [x1], #8
+        mov              v2.8b, v1.8b
+        st2              {v2.8b,v3.8b}, [x0], #16
+5:
+        cbz              w9, 3f
+2:
+        subs             w9, w9, #2                        // 0 - 7 left
+        ldrb             w12, [x3], #1
+        strb             w12, [x1], #1
+        ldrb             w12, [x3], #1
+        strb             w12, [x0], #1
+        ldrb             w12, [x3], #1
+        strb             w12, [x2], #1
+        ldrb             w12, [x3], #1
+        strb             w12, [x0], #1
+        b.ne             2b
+3:
+        add              x3, x3, x8
+        add              x0, x0, x6
+        add              x1, x1, x7
+        add              x2, x2, x7
+        cbnz             w5, 1b
+        ret
+endfunc
-- 
2.47.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422
  2025-02-07 19:06 [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422 Krzysztof Pyrkosz via ffmpeg-devel
@ 2025-02-10 13:15 ` Martin Storsjö
  0 siblings, 0 replies; 2+ messages in thread
From: Martin Storsjö @ 2025-02-10 13:15 UTC (permalink / raw)
  To: Krzysztof Pyrkosz via ffmpeg-devel; +Cc: Krzysztof Pyrkosz

On Fri, 7 Feb 2025, Krzysztof Pyrkosz via ffmpeg-devel wrote:

> The patch contains NEON code that splits the uyvy input array into 3
> separate buffers.
>
> The existing test cases are covering scenarios with odd height and odd
> stride, but width is even in every instance. Is it safe to make that
> assumption about the width?

For something like uyvy, I'm kinda ok with assuming that, especially if we 
don't have test coverage for it. But ideally, it would of course be clear 
how those aspects are handled wrt assembly functions, indeed!

> Just as I'm about to send this patch, I'm thinking if non-interleaved
> read followed by 4 invocations of TBL wouldn't be more performant. One
> call to generate a contiguous vector of u, second for v and two for y.
> I'm curious to find out.

My guess is that it may be more performant on more modern cores, but 
probably not on older ones.

>
> The speed:
>
> A78:
> uyvytoyuv422_c:                                      42213.5 ( 1.00x)
> uyvytoyuv422_neon:                                    5298.8 ( 7.97x)
>
> A72:
> uyvytoyuv422_c:                                      61797.6 ( 1.00x)
> uyvytoyuv422_neon:                                   12141.9 ( 5.09x)
>
> x13s:
> uyvytoyuv422_c:                                      28581.7 ( 1.00x)
> uyvytoyuv422_neon:                                    4882.4 ( 5.85x)
>
> Krzysztof
>
> ---
> libswscale/aarch64/rgb2rgb.c      |  5 +++
> libswscale/aarch64/rgb2rgb_neon.S | 51 +++++++++++++++++++++++++++++++
> 2 files changed, 56 insertions(+)
>
> diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
> index 7e1dba572d..096ed9f363 100644
> --- a/libswscale/aarch64/rgb2rgb.c
> +++ b/libswscale/aarch64/rgb2rgb.c
> @@ -67,6 +67,10 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
>
> +void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> +                          const uint8_t *src, int width, int height,
> +                          int lumStride, int chromStride, int srcStride);
> +
> av_cold void rgb2rgb_init_aarch64(void)
> {
>     int cpu_flags = av_get_cpu_flags();
> @@ -84,5 +88,6 @@ av_cold void rgb2rgb_init_aarch64(void)
>         shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
>         shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
>         shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
> +        uyvytoyuv422       = ff_uyvytoyuv422_neon;
>     }
> }
> diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
> index 22ecdf7ac8..bdbee7df2e 100644
> --- a/libswscale/aarch64/rgb2rgb_neon.S
> +++ b/libswscale/aarch64/rgb2rgb_neon.S
> @@ -427,3 +427,54 @@ neon_shuf 2013
> neon_shuf 1203
> neon_shuf 2130
> neon_shuf 3210
> +
> +function ff_uyvytoyuv422_neon, export=1
> +        sxtw             x6, w6

The indentation of the arguments column is off by one char within this 
whole function.

For testing various aarch64 assembly details (building with unusual 
toolchains etc), I've set up a little extra CI for that on github; have a 
look at the branch at the topmost 4 commits at 
https://github.com/mstorsjo/ffmpeg/commits/gha-aarch64.

If you include those 4 commits in a branch and push to a personal fork at 
github, you'll get test results for that push like this: 
https://github.com/mstorsjo/FFmpeg/actions/runs/13240513523

For these two patches, I got the following results:
https://github.com/mstorsjo/FFmpeg/actions/runs/13240526767

This points out the unexpected indentation here.

> +        sxtw             x7, w7
> +        ldrsw            x8, [sp]
> +        ubfx             x10, x4, #1, #31

The ubfx instruction is kinda esoteric; I presume what you're doing here 
is essentially the same as "lsr #1"? That'd be much more idiomatic and 
readable.

> +        sub              x8, x8, w4, sxtw #1               // src offset
> +        sub              x6, x6, w4, sxtw                  // lum offset
> +        sub              x7, x7, x10                       // chr offset
> +1:
> +        sub              w5, w5, #1

It feels a bit unusual to do the decrement of the outer loop counter here 
at this point; I feel that it would be more readable in context if it was 
done at the end after the 3: label. (There can of course be good reasons 
for doing it early due to scheduling etc, but I don't see such a case 
here.)

> +        ands             w10, w4, #~31
> +        and              w9, w4, #15
> +        and              w11, w4, #16
> +        b.eq             7f
> +4:
> +        ld4              {v0.16b - v3.16b}, [x3], #64      // handle 16 uyvy tuples per iteration
> +        subs             w10, w10, #32
> +        st1              {v2.16b}, [x2], #16
> +        st1              {v0.16b}, [x1], #16
> +        mov              v2.16b, v1.16b
> +        st2              {v2.16b,v3.16b}, [x0], #32
> +        b.ne             4b
> +7:
> +        cbz              w11, 5f                           // 8 - 15 remaining
> +        ld4              {v0.8b - v3.8b}, [x3], #32
> +        st1              {v2.8b}, [x2], #8
> +        st1              {v0.8b}, [x1], #8
> +        mov              v2.8b, v1.8b
> +        st2              {v2.8b,v3.8b}, [x0], #16
> +5:
> +        cbz              w9, 3f
> +2:
> +        subs             w9, w9, #2                        // 0 - 7 left
> +        ldrb             w12, [x3], #1
> +        strb             w12, [x1], #1
> +        ldrb             w12, [x3], #1
> +        strb             w12, [x0], #1
> +        ldrb             w12, [x3], #1
> +        strb             w12, [x2], #1
> +        ldrb             w12, [x3], #1
> +        strb             w12, [x0], #1
> +        b.ne             2b
> +3:
> +        add              x3, x3, x8
> +        add              x0, x0, x6
> +        add              x1, x1, x7
> +        add              x2, x2, x7
> +        cbnz             w5, 1b
> +        ret
> +endfunc

If the height decrement is moved into the end here, it can be a subs with 
a regular b.gt branch.

Other than that, this looks quite reasonable, thanks!

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2025-02-10 13:15 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-02-07 19:06 [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422 Krzysztof Pyrkosz via ffmpeg-devel
2025-02-10 13:15 ` Martin Storsjö

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git