From: Krzysztof Pyrkosz via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: Krzysztof Pyrkosz <ffmpeg@szaka.eu>
Subject: [FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb_neon: Implemented uyvytoyuv422
Date: Fri, 7 Feb 2025 20:06:48 +0100
Message-ID: <20250207190647.4729-2-ffmpeg@szaka.eu> (raw)
The patch contains NEON code that splits the uyvy input array into 3
separate buffers.
The existing test cases are covering scenarios with odd height and odd
stride, but width is even in every instance. Is it safe to make that
assumption about the width?
Just as I'm about to send this patch, I'm thinking if non-interleaved
read followed by 4 invocations of TBL wouldn't be more performant. One
call to generate a contiguous vector of u, second for v and two for y.
I'm curious to find out.
The speed:
A78:
uyvytoyuv422_c: 42213.5 ( 1.00x)
uyvytoyuv422_neon: 5298.8 ( 7.97x)
A72:
uyvytoyuv422_c: 61797.6 ( 1.00x)
uyvytoyuv422_neon: 12141.9 ( 5.09x)
x13s:
uyvytoyuv422_c: 28581.7 ( 1.00x)
uyvytoyuv422_neon: 4882.4 ( 5.85x)
Krzysztof
---
libswscale/aarch64/rgb2rgb.c | 5 +++
libswscale/aarch64/rgb2rgb_neon.S | 51 +++++++++++++++++++++++++++++++
2 files changed, 56 insertions(+)
diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index 7e1dba572d..096ed9f363 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -67,6 +67,10 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+
av_cold void rgb2rgb_init_aarch64(void)
{
int cpu_flags = av_get_cpu_flags();
@@ -84,5 +88,6 @@ av_cold void rgb2rgb_init_aarch64(void)
shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
+ uyvytoyuv422 = ff_uyvytoyuv422_neon;
}
}
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 22ecdf7ac8..bdbee7df2e 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -427,3 +427,54 @@ neon_shuf 2013
neon_shuf 1203
neon_shuf 2130
neon_shuf 3210
+
+function ff_uyvytoyuv422_neon, export=1
+ sxtw x6, w6
+ sxtw x7, w7
+ ldrsw x8, [sp]
+ ubfx x10, x4, #1, #31
+ sub x8, x8, w4, sxtw #1 // src offset
+ sub x6, x6, w4, sxtw // lum offset
+ sub x7, x7, x10 // chr offset
+1:
+ sub w5, w5, #1
+ ands w10, w4, #~31
+ and w9, w4, #15
+ and w11, w4, #16
+ b.eq 7f
+4:
+ ld4 {v0.16b - v3.16b}, [x3], #64 // handle 16 uyvy tuples per iteration
+ subs w10, w10, #32
+ st1 {v2.16b}, [x2], #16
+ st1 {v0.16b}, [x1], #16
+ mov v2.16b, v1.16b
+ st2 {v2.16b,v3.16b}, [x0], #32
+ b.ne 4b
+7:
+ cbz w11, 5f // 8 - 15 remaining
+ ld4 {v0.8b - v3.8b}, [x3], #32
+ st1 {v2.8b}, [x2], #8
+ st1 {v0.8b}, [x1], #8
+ mov v2.8b, v1.8b
+ st2 {v2.8b,v3.8b}, [x0], #16
+5:
+ cbz w9, 3f
+2:
+ subs w9, w9, #2 // 0 - 7 left
+ ldrb w12, [x3], #1
+ strb w12, [x1], #1
+ ldrb w12, [x3], #1
+ strb w12, [x0], #1
+ ldrb w12, [x3], #1
+ strb w12, [x2], #1
+ ldrb w12, [x3], #1
+ strb w12, [x0], #1
+ b.ne 2b
+3:
+ add x3, x3, x8
+ add x0, x0, x6
+ add x1, x1, x7
+ add x2, x2, x7
+ cbnz w5, 1b
+ ret
+endfunc
--
2.47.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next reply other threads:[~2025-02-07 19:27 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-07 19:06 Krzysztof Pyrkosz via ffmpeg-devel [this message]
2025-02-10 13:15 ` Martin Storsjö
2025-02-11 21:24 ` Krzysztof Pyrkosz via ffmpeg-devel
2025-02-11 21:33 ` Krzysztof Pyrkosz via ffmpeg-devel
2025-02-11 21:53 ` Martin Storsjö
2025-02-11 22:06 ` [FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_rgb: Added {yuyv, uyvy}toyuv{420, 422} test cases Krzysztof Pyrkosz via ffmpeg-devel
2025-02-12 9:37 ` Martin Storsjö
2025-02-17 9:06 ` Martin Storsjö
2025-02-17 9:37 ` Martin Storsjö
2025-02-11 22:06 ` [FFmpeg-devel] [PATCH 2/2] swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422} Krzysztof Pyrkosz via ffmpeg-devel
2025-02-12 12:02 ` Martin Storsjö
2025-02-13 19:02 ` [FFmpeg-devel] [PATCH] " Krzysztof Pyrkosz via ffmpeg-devel
2025-02-17 9:39 ` Martin Storsjö
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250207190647.4729-2-ffmpeg@szaka.eu \
--to=ffmpeg-devel@ffmpeg.org \
--cc=ffmpeg@szaka.eu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git