* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422
2025-02-10 13:15 ` Martin Storsjö
@ 2025-02-11 21:24 ` Krzysztof Pyrkosz via ffmpeg-devel
2025-02-11 21:33 ` Krzysztof Pyrkosz via ffmpeg-devel
1 sibling, 0 replies; 7+ messages in thread
From: Krzysztof Pyrkosz via ffmpeg-devel @ 2025-02-11 21:24 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Krzysztof Pyrkosz
[-- Attachment #1: Type: text/plain, Size: 1130 bytes --]
On Mon, Feb 10, 2025 at 03:15:35PM +0200, Martin Storsjö wrote:
> > Just as I'm about to send this patch, I'm thinking if non-interleaved
> > read followed by 4 invocations of TBL wouldn't be more performant. One
> > call to generate a contiguous vector of u, second for v and two for y.
> > I'm curious to find out.
>
> My guess is that it may be more performant on more modern cores, but
> probably not on older ones.
That's the case. It's 15% faster on A78 and twice as slow on A72.
>
> > + sxtw x7, w7
> > + ldrsw x8, [sp]
> > + ubfx x10, x4, #1, #31
>
> The ubfx instruction is kinda esoteric; I presume what you're doing here is
> essentially the same as "lsr #1"? That'd be much more idiomatic and
> readable.
That's correct. What put me off was that register 4 is passed as int
(w4) and I expected register 10 to be 64 bits long with high bits set to
0. lsr w10, w4, #1 already does that.
I modified the code to handle {uyvy,yuyv}toyuv{420,422} using macros,
since these 4 functions share common routines. The code lost some
readability, though.
Krzysztof
[-- Attachment #2: 0001-swscale-aarch64-rgb2rgb_neon-Implemented-uyvytoyuv42.patch --]
[-- Type: text/x-diff, Size: 15406 bytes --]
From a27f554ada9f2e81b4b19d313c9f19b348824ef1 Mon Sep 17 00:00:00 2001
From: Krzysztof Pyrkosz <ffmpeg@szaka.eu>
Date: Tue, 11 Feb 2025 22:04:19 +0100
Subject: [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422
---
libswscale/aarch64/rgb2rgb.c | 16 ++
libswscale/aarch64/rgb2rgb_neon.S | 262 ++++++++++++++++++++++++++++++
tests/checkasm/sw_rgb.c | 63 ++++---
3 files changed, 318 insertions(+), 23 deletions(-)
diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index 7e1dba572d..f474228298 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -67,6 +67,18 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
av_cold void rgb2rgb_init_aarch64(void)
{
int cpu_flags = av_get_cpu_flags();
@@ -84,5 +96,9 @@ av_cold void rgb2rgb_init_aarch64(void)
shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
+ uyvytoyuv422 = ff_uyvytoyuv422_neon;
+ uyvytoyuv420 = ff_uyvytoyuv420_neon;
+ yuyvtoyuv422 = ff_yuyvtoyuv422_neon;
+ yuyvtoyuv420 = ff_yuyvtoyuv420_neon;
}
}
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 22ecdf7ac8..9002aa028f 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -427,3 +427,265 @@ neon_shuf 2013
neon_shuf 1203
neon_shuf 2130
neon_shuf 3210
+
+/*
+v0-v7 - two consecutive lines
+x0 - upper Y destination
+x1 - U destination
+x2 - V destination
+x3 - upper src line
+w5 - width/iteration counter - count of line pairs for yuv420, of single lines for 422
+x6 - lum padding
+x7 - chrom padding
+x8 - src padding
+w9 - number of bytes remaining in the tail
+x10 - lower Y destination
+w12 - tmp
+x13 - lower src line
+w14 - tmp
+w17 - set to 1 if last line has to be handled separately (odd height)
+*/
+
+// one fast path iteration processes 16 uyvy tuples
+// is_line_tail is set to 1 when final 16 tuples are being processed
+// skip_storing_chroma is set to 1 when final line is processed and the height is odd
+.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma
+ ld4 {v0.16b - v3.16b}, [x3], #64
+.if ! \is_line_tail
+ subs w14, w14, #32
+.endif
+
+.if ! \skip_storing_chroma
+.if \dst_fmt == yuv420
+ ld4 {v4.16b - v7.16b}, [x13], #64
+.endif
+
+.if \dst_fmt == yuv420 // store UV
+.if \src_fmt == uyvy
+ uhadd v0.16b, v4.16b, v0.16b // halving sum of U
+ uhadd v2.16b, v6.16b, v2.16b // halving sum of V
+.else
+ uhadd v1.16b, v5.16b, v1.16b // halving sum of U
+ uhadd v3.16b, v7.16b, v3.16b // halving sum of V
+.endif
+.endif
+
+.if \src_fmt == uyvy
+ st1 {v2.16b}, [x2], #16
+ st1 {v0.16b}, [x1], #16
+.else
+ st1 {v3.16b}, [x2], #16
+ st1 {v1.16b}, [x1], #16
+.endif
+
+.if \dst_fmt == yuv420 // store_y
+.if \src_fmt == uyvy
+ mov v6.16b, v5.16b
+ st2 {v6.16b,v7.16b}, [x10], #32
+.else
+ mov v5.16b, v4.16b
+ st2 {v5.16b,v6.16b}, [x10], #32
+.endif
+.endif
+
+.endif // ! \skip_storing_chroma
+
+.if \src_fmt == uyvy
+ mov v2.16b, v1.16b
+ st2 {v2.16b,v3.16b}, [x0], #32
+.else
+ mov v1.16b, v0.16b
+ st2 {v1.16b,v2.16b}, [x0], #32
+.endif
+.endm
+
+// shift pointers back to width - 32 to process the tail of the line
+// if the height is odd, processing the final line is simplified
+.macro fastpath_shift_back_pointers src_fmt, dst_fmt, is_final_odd_line
+ add x3, x3, w9, sxtw #1
+ sub x3, x3, #64
+.if ! \is_final_odd_line
+.if \dst_fmt == yuv420
+ add x13, x13, w9, sxtw #1
+ sub x13, x13, #64
+ add x10, x10, w9, sxtw
+ sub x10, x10, #32
+.endif
+.endif
+ add x0, x0, w9, sxtw
+ sub x0, x0, #32
+.if ! \is_final_odd_line
+ asr w14, w9, #1
+ add x1, x1, w14, sxtw
+ sub x1, x1, #16
+ add x2, x2, w14, sxtw
+ sub x2, x2, #16
+.endif
+.endm
+
+.macro slowpath_iteration src_fmt, dst_fmt, skip_storing_chroma
+.if \dst_fmt == yuv422
+.if \src_fmt == uyvy
+ ldrb w12, [x3], #1
+ ldrb w14, [x3], #1
+ strb w12, [x1], #1
+ strb w14, [x0], #1
+ ldrb w12, [x3], #1
+ ldrb w14, [x3], #1
+ strb w12, [x2], #1
+ strb w14, [x0], #1
+.else
+ ldrb w12, [x3], #1
+ ldrb w14, [x3], #1
+ strb w12, [x0], #1
+ strb w14, [x1], #1
+ ldrb w12, [x3], #1
+ ldrb w14, [x3], #1
+ strb w12, [x0], #1
+ strb w14, [x2], #1
+.endif
+.endif
+.if \dst_fmt == yuv420
+.if \src_fmt == uyvy
+.if \skip_storing_chroma
+ ldrb w12, [x3], #2
+ ldrb w14, [x3], #2
+ strb w12, [x0], #1
+ strb w14, [x0], #1
+.else
+ ldrb w12, [x3], #1
+ ldrb w14, [x13], #1
+ add w12, w12, w14
+ lsr w12, w12, #1
+ strb w12, [x1], #1
+ ldrb w14, [x3], #1
+ ldrb w12, [x13], #1
+ strb w14, [x0], #1
+ strb w12, [x10], #1
+ ldrb w14, [x13], #1
+ ldrb w12, [x3], #1
+ add w12, w12, w14
+ lsr w12, w12, #1
+ strb w12, [x2], #1
+ ldrb w14, [x3], #1
+ ldrb w12, [x13], #1
+ strb w14, [x0], #1
+ strb w12, [x10], #1
+.endif
+.else
+.if \skip_storing_chroma
+ ldrb w12, [x3], #2
+ ldrb w14, [x3], #2
+ strb w12, [x0], #1
+ strb w14, [x0], #1
+.else
+ ldrb w12, [x3], #1
+ ldrb w14, [x13], #1
+ strb w12, [x0], #1
+ strb w14, [x10], #1
+ ldrb w12, [x3], #1
+ ldrb w14, [x13], #1
+ add w12, w12, w14
+ lsr w12, w12, #1
+ strb w12, [x1], #1
+ ldrb w14, [x3], #1
+ ldrb w12, [x13], #1
+ strb w14, [x0], #1
+ strb w12, [x10], #1
+ ldrb w14, [x13], #1
+ ldrb w12, [x3], #1
+ add w12, w12, w14
+ lsr w12, w12, #1
+ strb w12, [x2], #1
+.endif
+.endif
+.endif
+.endm
+
+.macro move_pointers_to_next_line src_fmt, dst_fmt, is_final_odd_line
+ add x3, x3, x8
+ add x0, x0, x6
+.if \dst_fmt == yuv420
+ add x13, x13, x8
+ add x10, x10, x6
+.endif
+ add x1, x1, x7
+ add x2, x2, x7
+.endm
+
+.macro interleaved_yuv_to_planar src_fmt, dst_fmt
+function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
+ sxtw x6, w6
+ sxtw x7, w7
+ ldrsw x8, [sp]
+ ands w11, w4, #~31 // choose between fast and slow path
+
+.if \dst_fmt == yuv420
+ add x10, x0, x6
+ add x13, x3, x8
+ add x8, x8, x8
+ add x6, x6, x6
+ and w17, w5, #1
+ asr w5, w5, #1
+.endif
+ asr w9, w4, #1
+ sub x8, x8, w4, sxtw #1 // src offset
+ sub x6, x6, w4, sxtw // lum offset
+ sub x7, x7, x9 // chr offset
+
+ b.eq 6f
+
+1: // fast path - the width is at least 32
+ and w14, w4, #~31 // w14 is the main loop counter
+ and w9, w4, #31 // w9 holds the remaining width, 0 to 31
+2:
+ fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+ b.ne 2b
+ fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0
+ fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+ subs w5, w5, #1
+ move_pointers_to_next_line \src_fmt, \dst_fmt
+ b.ne 1b
+
+.if \dst_fmt == yuv420 // handle the last line in case the height is odd
+ cbz w17, 3f
+ and w14, w4, #~31
+4:
+ fastpath_iteration \src_fmt, \dst_fmt, 0, 1
+ b.ne 4b
+ fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1
+ fastpath_iteration \src_fmt, \dst_fmt, 1, 1
+3:
+.endif
+ ret
+
+6: // slow path - width is at most 31
+ and w9, w4, #31
+7:
+ subs w9, w9, #2
+ slowpath_iteration \src_fmt, \dst_fmt, 0
+ b.ne 7b
+ subs w5, w5, #1
+ move_pointers_to_next_line \src_fmt, \dst_fmt
+ b.ne 6b
+
+.if \dst_fmt == yuv420
+ cbz w17, 8f
+ and w9, w4, #31
+.if \src_fmt == uyvy
+ add x3, x3, #1
+.endif
+5:
+ subs w9, w9, #2
+ slowpath_iteration \src_fmt, \dst_fmt, 1
+ b.ne 5b
+8:
+.endif
+ ret
+endfunc
+.endm
+
+interleaved_yuv_to_planar uyvy, yuv422
+interleaved_yuv_to_planar uyvy, yuv420
+interleaved_yuv_to_planar yuyv, yuv422
+interleaved_yuv_to_planar yuyv, yuv420
diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index b98c7c6b47..183b4eeaa8 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -61,7 +61,7 @@ static void check_shuffle_bytes(void * func, const char * report)
memcpy(src1, src0, MAX_STRIDE);
if (check_func(func, "%s", report)) {
- for (i = 0; i < 6; i ++) {
+ for (i = 0; i < FF_ARRAY_ELEMS(width); i ++) {
call_ref(src0, dst0, width[i]);
call_new(src1, dst1, width[i]);
if (memcmp(dst0, dst1, MAX_STRIDE))
@@ -71,9 +71,24 @@ static void check_shuffle_bytes(void * func, const char * report)
}
}
-static void check_uyvy_to_422p(void)
+typedef void (*uyvy_to_yuv_func)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+
+typedef struct
+{
+ uyvy_to_yuv_func func;
+ const char* from;
+ int to;
+} uyvy_to_yuv_f;
+
+static void check_uyvy_to_yuv(void)
{
int i;
+ uyvy_to_yuv_f funcs[] = {
+ {uyvytoyuv420, "uyvy", 420}, {uyvytoyuv422, "uyvy", 422},
+ {yuyvtoyuv420, "yuyv", 420}, {yuyvtoyuv422, "yuyv", 422}
+ };
LOCAL_ALIGNED_32(uint8_t, src0, [MAX_STRIDE * MAX_HEIGHT * 2]);
LOCAL_ALIGNED_32(uint8_t, src1, [MAX_STRIDE * MAX_HEIGHT * 2]);
@@ -91,26 +106,28 @@ static void check_uyvy_to_422p(void)
randomize_buffers(src0, MAX_STRIDE * MAX_HEIGHT * 2);
memcpy(src1, src0, MAX_STRIDE * MAX_HEIGHT * 2);
- if (check_func(uyvytoyuv422, "uyvytoyuv422")) {
- for (i = 0; i < 6; i ++) {
- memset(dst_y_0, 0, MAX_STRIDE * MAX_HEIGHT);
- memset(dst_y_1, 0, MAX_STRIDE * MAX_HEIGHT);
- memset(dst_u_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
- memset(dst_u_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
- memset(dst_v_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
- memset(dst_v_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-
- call_ref(dst_y_0, dst_u_0, dst_v_0, src0, planes[i].w, planes[i].h,
- MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
- call_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[i].w, planes[i].h,
- MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
- if (memcmp(dst_y_0, dst_y_1, MAX_STRIDE * MAX_HEIGHT) ||
- memcmp(dst_u_0, dst_u_1, (MAX_STRIDE/2) * MAX_HEIGHT) ||
- memcmp(dst_v_0, dst_v_1, (MAX_STRIDE/2) * MAX_HEIGHT))
- fail();
+ for (int k = 0; k < FF_ARRAY_ELEMS(funcs); k ++) {
+ if (check_func(funcs[k].func, "%stoyuv%d",funcs[k].from, funcs[k].to)) {
+ for (i = 0; i < FF_ARRAY_ELEMS(planes); i ++) {
+ memset(dst_y_0, 0, MAX_STRIDE * MAX_HEIGHT);
+ memset(dst_y_1, 0, MAX_STRIDE * MAX_HEIGHT);
+ memset(dst_u_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+ memset(dst_u_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+ memset(dst_v_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+ memset(dst_v_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+
+ call_ref(dst_y_0, dst_u_0, dst_v_0, src0, planes[i].w, planes[i].h,
+ MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+ call_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[i].w, planes[i].h,
+ MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+ if (memcmp(dst_y_0, dst_y_1, MAX_STRIDE * MAX_HEIGHT) ||
+ memcmp(dst_u_0, dst_u_1, (MAX_STRIDE/2) * MAX_HEIGHT) ||
+ memcmp(dst_v_0, dst_v_1, (MAX_STRIDE/2) * MAX_HEIGHT))
+ fail();
+ }
+ bench_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[5].w, planes[5].h,
+ MAX_STRIDE, MAX_STRIDE / 2, planes[5].s);
}
- bench_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[5].w, planes[5].h,
- MAX_STRIDE, MAX_STRIDE / 2, planes[5].s);
}
}
@@ -834,8 +851,8 @@ void checkasm_check_sw_rgb(void)
check_shuffle_bytes(shuffle_bytes_2130, "shuffle_bytes_2130");
report("shuffle_bytes_2130");
- check_uyvy_to_422p();
- report("uyvytoyuv422");
+ check_uyvy_to_yuv();
+ report("uyvytoyuv");
check_interleave_bytes();
report("interleave_bytes");
--
2.47.2
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422
2025-02-10 13:15 ` Martin Storsjö
2025-02-11 21:24 ` Krzysztof Pyrkosz via ffmpeg-devel
@ 2025-02-11 21:33 ` Krzysztof Pyrkosz via ffmpeg-devel
2025-02-11 21:53 ` Martin Storsjö
1 sibling, 1 reply; 7+ messages in thread
From: Krzysztof Pyrkosz via ffmpeg-devel @ 2025-02-11 21:33 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Krzysztof Pyrkosz
I forgot to include the benchmarks in the previous message, here they
are:
A78:
uyvytoyuv420_neon: 6112.5 ( 6.96x)
uyvytoyuv422_neon: 6696.0 ( 6.32x)
yuyvtoyuv420_neon: 6113.0 ( 6.95x)
yuyvtoyuv422_neon: 6695.2 ( 6.31x)
A72:
uyvytoyuv420_neon: 9512.1 ( 6.09x)
uyvytoyuv422_neon: 9766.8 ( 6.32x)
yuyvtoyuv420_neon: 9639.1 ( 6.00x)
yuyvtoyuv422_neon: 9779.0 ( 6.03x)
A53:
uyvytoyuv420_neon: 12720.1 ( 9.10x)
uyvytoyuv422_neon: 14282.9 ( 6.71x)
yuyvtoyuv420_neon: 12637.4 ( 9.15x)
yuyvtoyuv422_neon: 14127.6 ( 6.77x)
---
libswscale/aarch64/rgb2rgb.c | 16 ++
libswscale/aarch64/rgb2rgb_neon.S | 262 ++++++++++++++++++++++++++++++
tests/checkasm/sw_rgb.c | 63 ++++---
3 files changed, 318 insertions(+), 23 deletions(-)
diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index 7e1dba572d..f474228298 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -67,6 +67,18 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+void ff_yuyvtoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
av_cold void rgb2rgb_init_aarch64(void)
{
int cpu_flags = av_get_cpu_flags();
@@ -84,5 +96,9 @@ av_cold void rgb2rgb_init_aarch64(void)
shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
+ uyvytoyuv422 = ff_uyvytoyuv422_neon;
+ uyvytoyuv420 = ff_uyvytoyuv420_neon;
+ yuyvtoyuv422 = ff_yuyvtoyuv422_neon;
+ yuyvtoyuv420 = ff_yuyvtoyuv420_neon;
}
}
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 22ecdf7ac8..9002aa028f 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -427,3 +427,265 @@ neon_shuf 2013
neon_shuf 1203
neon_shuf 2130
neon_shuf 3210
+
+/*
+v0-v7 - two consecutive lines
+x0 - upper Y destination
+x1 - U destination
+x2 - V destination
+x3 - upper src line
+w5 - width/iteration counter - count of line pairs for yuv420, of single lines for 422
+x6 - lum padding
+x7 - chrom padding
+x8 - src padding
+w9 - number of bytes remaining in the tail
+x10 - lower Y destination
+w12 - tmp
+x13 - lower src line
+w14 - tmp
+w17 - set to 1 if last line has to be handled separately (odd height)
+*/
+
+// one fast path iteration processes 16 uyvy tuples
+// is_line_tail is set to 1 when final 16 tuples are being processed
+// skip_storing_chroma is set to 1 when final line is processed and the height is odd
+.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma
+ ld4 {v0.16b - v3.16b}, [x3], #64
+.if ! \is_line_tail
+ subs w14, w14, #32
+.endif
+
+.if ! \skip_storing_chroma
+.if \dst_fmt == yuv420
+ ld4 {v4.16b - v7.16b}, [x13], #64
+.endif
+
+.if \dst_fmt == yuv420 // store UV
+.if \src_fmt == uyvy
+ uhadd v0.16b, v4.16b, v0.16b // halving sum of U
+ uhadd v2.16b, v6.16b, v2.16b // halving sum of V
+.else
+ uhadd v1.16b, v5.16b, v1.16b // halving sum of U
+ uhadd v3.16b, v7.16b, v3.16b // halving sum of V
+.endif
+.endif
+
+.if \src_fmt == uyvy
+ st1 {v2.16b}, [x2], #16
+ st1 {v0.16b}, [x1], #16
+.else
+ st1 {v3.16b}, [x2], #16
+ st1 {v1.16b}, [x1], #16
+.endif
+
+.if \dst_fmt == yuv420 // store_y
+.if \src_fmt == uyvy
+ mov v6.16b, v5.16b
+ st2 {v6.16b,v7.16b}, [x10], #32
+.else
+ mov v5.16b, v4.16b
+ st2 {v5.16b,v6.16b}, [x10], #32
+.endif
+.endif
+
+.endif // ! \skip_storing_chroma
+
+.if \src_fmt == uyvy
+ mov v2.16b, v1.16b
+ st2 {v2.16b,v3.16b}, [x0], #32
+.else
+ mov v1.16b, v0.16b
+ st2 {v1.16b,v2.16b}, [x0], #32
+.endif
+.endm
+
+// shift pointers back to width - 32 to process the tail of the line
+// if the height is odd, processing the final line is simplified
+.macro fastpath_shift_back_pointers src_fmt, dst_fmt, is_final_odd_line
+ add x3, x3, w9, sxtw #1
+ sub x3, x3, #64
+.if ! \is_final_odd_line
+.if \dst_fmt == yuv420
+ add x13, x13, w9, sxtw #1
+ sub x13, x13, #64
+ add x10, x10, w9, sxtw
+ sub x10, x10, #32
+.endif
+.endif
+ add x0, x0, w9, sxtw
+ sub x0, x0, #32
+.if ! \is_final_odd_line
+ asr w14, w9, #1
+ add x1, x1, w14, sxtw
+ sub x1, x1, #16
+ add x2, x2, w14, sxtw
+ sub x2, x2, #16
+.endif
+.endm
+
+.macro slowpath_iteration src_fmt, dst_fmt, skip_storing_chroma
+.if \dst_fmt == yuv422
+.if \src_fmt == uyvy
+ ldrb w12, [x3], #1
+ ldrb w14, [x3], #1
+ strb w12, [x1], #1
+ strb w14, [x0], #1
+ ldrb w12, [x3], #1
+ ldrb w14, [x3], #1
+ strb w12, [x2], #1
+ strb w14, [x0], #1
+.else
+ ldrb w12, [x3], #1
+ ldrb w14, [x3], #1
+ strb w12, [x0], #1
+ strb w14, [x1], #1
+ ldrb w12, [x3], #1
+ ldrb w14, [x3], #1
+ strb w12, [x0], #1
+ strb w14, [x2], #1
+.endif
+.endif
+.if \dst_fmt == yuv420
+.if \src_fmt == uyvy
+.if \skip_storing_chroma
+ ldrb w12, [x3], #2
+ ldrb w14, [x3], #2
+ strb w12, [x0], #1
+ strb w14, [x0], #1
+.else
+ ldrb w12, [x3], #1
+ ldrb w14, [x13], #1
+ add w12, w12, w14
+ lsr w12, w12, #1
+ strb w12, [x1], #1
+ ldrb w14, [x3], #1
+ ldrb w12, [x13], #1
+ strb w14, [x0], #1
+ strb w12, [x10], #1
+ ldrb w14, [x13], #1
+ ldrb w12, [x3], #1
+ add w12, w12, w14
+ lsr w12, w12, #1
+ strb w12, [x2], #1
+ ldrb w14, [x3], #1
+ ldrb w12, [x13], #1
+ strb w14, [x0], #1
+ strb w12, [x10], #1
+.endif
+.else
+.if \skip_storing_chroma
+ ldrb w12, [x3], #2
+ ldrb w14, [x3], #2
+ strb w12, [x0], #1
+ strb w14, [x0], #1
+.else
+ ldrb w12, [x3], #1
+ ldrb w14, [x13], #1
+ strb w12, [x0], #1
+ strb w14, [x10], #1
+ ldrb w12, [x3], #1
+ ldrb w14, [x13], #1
+ add w12, w12, w14
+ lsr w12, w12, #1
+ strb w12, [x1], #1
+ ldrb w14, [x3], #1
+ ldrb w12, [x13], #1
+ strb w14, [x0], #1
+ strb w12, [x10], #1
+ ldrb w14, [x13], #1
+ ldrb w12, [x3], #1
+ add w12, w12, w14
+ lsr w12, w12, #1
+ strb w12, [x2], #1
+.endif
+.endif
+.endif
+.endm
+
+.macro move_pointers_to_next_line src_fmt, dst_fmt, is_final_odd_line
+ add x3, x3, x8
+ add x0, x0, x6
+.if \dst_fmt == yuv420
+ add x13, x13, x8
+ add x10, x10, x6
+.endif
+ add x1, x1, x7
+ add x2, x2, x7
+.endm
+
+.macro interleaved_yuv_to_planar src_fmt, dst_fmt
+function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
+ sxtw x6, w6
+ sxtw x7, w7
+ ldrsw x8, [sp]
+ ands w11, w4, #~31 // choose between fast and slow path
+
+.if \dst_fmt == yuv420
+ add x10, x0, x6
+ add x13, x3, x8
+ add x8, x8, x8
+ add x6, x6, x6
+ and w17, w5, #1
+ asr w5, w5, #1
+.endif
+ asr w9, w4, #1
+ sub x8, x8, w4, sxtw #1 // src offset
+ sub x6, x6, w4, sxtw // lum offset
+ sub x7, x7, x9 // chr offset
+
+ b.eq 6f
+
+1: // fast path - the width is at least 32
+ and w14, w4, #~31 // w14 is the main loop counter
+ and w9, w4, #31 // w9 holds the remaining width, 0 to 31
+2:
+ fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+ b.ne 2b
+ fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0
+ fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+ subs w5, w5, #1
+ move_pointers_to_next_line \src_fmt, \dst_fmt
+ b.ne 1b
+
+.if \dst_fmt == yuv420 // handle the last line in case the height is odd
+ cbz w17, 3f
+ and w14, w4, #~31
+4:
+ fastpath_iteration \src_fmt, \dst_fmt, 0, 1
+ b.ne 4b
+ fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1
+ fastpath_iteration \src_fmt, \dst_fmt, 1, 1
+3:
+.endif
+ ret
+
+6: // slow path - width is at most 31
+ and w9, w4, #31
+7:
+ subs w9, w9, #2
+ slowpath_iteration \src_fmt, \dst_fmt, 0
+ b.ne 7b
+ subs w5, w5, #1
+ move_pointers_to_next_line \src_fmt, \dst_fmt
+ b.ne 6b
+
+.if \dst_fmt == yuv420
+ cbz w17, 8f
+ and w9, w4, #31
+.if \src_fmt == uyvy
+ add x3, x3, #1
+.endif
+5:
+ subs w9, w9, #2
+ slowpath_iteration \src_fmt, \dst_fmt, 1
+ b.ne 5b
+8:
+.endif
+ ret
+endfunc
+.endm
+
+interleaved_yuv_to_planar uyvy, yuv422
+interleaved_yuv_to_planar uyvy, yuv420
+interleaved_yuv_to_planar yuyv, yuv422
+interleaved_yuv_to_planar yuyv, yuv420
diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index b98c7c6b47..183b4eeaa8 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -61,7 +61,7 @@ static void check_shuffle_bytes(void * func, const char * report)
memcpy(src1, src0, MAX_STRIDE);
if (check_func(func, "%s", report)) {
- for (i = 0; i < 6; i ++) {
+ for (i = 0; i < FF_ARRAY_ELEMS(width); i ++) {
call_ref(src0, dst0, width[i]);
call_new(src1, dst1, width[i]);
if (memcmp(dst0, dst1, MAX_STRIDE))
@@ -71,9 +71,24 @@ static void check_shuffle_bytes(void * func, const char * report)
}
}
-static void check_uyvy_to_422p(void)
+typedef void (*uyvy_to_yuv_func)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+
+typedef struct
+{
+ uyvy_to_yuv_func func;
+ const char* from;
+ int to;
+} uyvy_to_yuv_f;
+
+static void check_uyvy_to_yuv(void)
{
int i;
+ uyvy_to_yuv_f funcs[] = {
+ {uyvytoyuv420, "uyvy", 420}, {uyvytoyuv422, "uyvy", 422},
+ {yuyvtoyuv420, "yuyv", 420}, {yuyvtoyuv422, "yuyv", 422}
+ };
LOCAL_ALIGNED_32(uint8_t, src0, [MAX_STRIDE * MAX_HEIGHT * 2]);
LOCAL_ALIGNED_32(uint8_t, src1, [MAX_STRIDE * MAX_HEIGHT * 2]);
@@ -91,26 +106,28 @@ static void check_uyvy_to_422p(void)
randomize_buffers(src0, MAX_STRIDE * MAX_HEIGHT * 2);
memcpy(src1, src0, MAX_STRIDE * MAX_HEIGHT * 2);
- if (check_func(uyvytoyuv422, "uyvytoyuv422")) {
- for (i = 0; i < 6; i ++) {
- memset(dst_y_0, 0, MAX_STRIDE * MAX_HEIGHT);
- memset(dst_y_1, 0, MAX_STRIDE * MAX_HEIGHT);
- memset(dst_u_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
- memset(dst_u_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
- memset(dst_v_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
- memset(dst_v_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
-
- call_ref(dst_y_0, dst_u_0, dst_v_0, src0, planes[i].w, planes[i].h,
- MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
- call_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[i].w, planes[i].h,
- MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
- if (memcmp(dst_y_0, dst_y_1, MAX_STRIDE * MAX_HEIGHT) ||
- memcmp(dst_u_0, dst_u_1, (MAX_STRIDE/2) * MAX_HEIGHT) ||
- memcmp(dst_v_0, dst_v_1, (MAX_STRIDE/2) * MAX_HEIGHT))
- fail();
+ for (int k = 0; k < FF_ARRAY_ELEMS(funcs); k ++) {
+ if (check_func(funcs[k].func, "%stoyuv%d",funcs[k].from, funcs[k].to)) {
+ for (i = 0; i < FF_ARRAY_ELEMS(planes); i ++) {
+ memset(dst_y_0, 0, MAX_STRIDE * MAX_HEIGHT);
+ memset(dst_y_1, 0, MAX_STRIDE * MAX_HEIGHT);
+ memset(dst_u_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+ memset(dst_u_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+ memset(dst_v_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+ memset(dst_v_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+
+ call_ref(dst_y_0, dst_u_0, dst_v_0, src0, planes[i].w, planes[i].h,
+ MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+ call_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[i].w, planes[i].h,
+ MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+ if (memcmp(dst_y_0, dst_y_1, MAX_STRIDE * MAX_HEIGHT) ||
+ memcmp(dst_u_0, dst_u_1, (MAX_STRIDE/2) * MAX_HEIGHT) ||
+ memcmp(dst_v_0, dst_v_1, (MAX_STRIDE/2) * MAX_HEIGHT))
+ fail();
+ }
+ bench_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[5].w, planes[5].h,
+ MAX_STRIDE, MAX_STRIDE / 2, planes[5].s);
}
- bench_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[5].w, planes[5].h,
- MAX_STRIDE, MAX_STRIDE / 2, planes[5].s);
}
}
@@ -834,8 +851,8 @@ void checkasm_check_sw_rgb(void)
check_shuffle_bytes(shuffle_bytes_2130, "shuffle_bytes_2130");
report("shuffle_bytes_2130");
- check_uyvy_to_422p();
- report("uyvytoyuv422");
+ check_uyvy_to_yuv();
+ report("uyvytoyuv");
check_interleave_bytes();
report("interleave_bytes");
--
2.47.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread