* [FFmpeg-devel] [PATCH] swscale: rgb_to_yuv neon optimizations
@ 2025-05-19 19:50 Dmitriy Kovalenko
2025-05-20 19:46 ` Michael Niedermayer
0 siblings, 1 reply; 4+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-19 19:50 UTC (permalink / raw)
To: ffmpeg-devel
I've found quite a few ways to optimize FFmpeg's existing RGB-to-YUV
subsampled conversion. This patch stack aims to improve its performance.

This particular set of changes is a small improvement to all of the
existing functions and macros. The biggest performance gain comes from
post-indexed increments of the source pointer with immediate prefetching
of the next memory block, and from interleaving the multiply and shift
operations on different registers for better scheduling.

It also replaces several cmp + b.le sequences with a single cbz/tbnz
instruction, plus some other small cleanups.
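
To illustrate the kind of changes involved, here is a simplified sketch
(not the exact code from the patch; register roles follow the existing
conventions in libswscale/aarch64/input.S):

    // before: compare + conditional branch, plain load, explicit pointer add
        cmp             w5, #0
        b.le            3f
    1:
        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
        // ... per-pixel math ...
        add             x3, x3, #64

    // after: single-instruction zero test, post-indexed load plus prefetch,
    // and U/V multiply-accumulates interleaved so independent smlal/smlal2
    // instructions can be scheduled back to back
        cbz             w5, 3f
    1:
        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
        prfm            pldl1strm, [x3, #64]
        smlal           v22.4s, v0.4h, v19.4h   // U += ru * r (low half)
        smlal           v24.4s, v3.4h, v19.4h   // V += rv * r (low half)
        smlal2          v23.4s, v0.8h, v19.8h   // U += ru * r (high half)
        smlal2          v25.4s, v3.8h, v19.8h   // V += rv * r (high half)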
Here are checkasm results on a MacBook Pro with the latest M4 Max:
<before>
bgra_to_uv_1080_c: 257.5 ( 1.00x)
bgra_to_uv_1080_neon: 211.9 ( 1.22x)
bgra_to_uv_1920_c: 467.1 ( 1.00x)
bgra_to_uv_1920_neon: 379.3 ( 1.23x)
bgra_to_uv_half_1080_c: 198.9 ( 1.00x)
bgra_to_uv_half_1080_neon: 125.7 ( 1.58x)
bgra_to_uv_half_1920_c: 346.3 ( 1.00x)
bgra_to_uv_half_1920_neon: 223.7 ( 1.55x)
<after>
bgra_to_uv_1080_c: 268.3 ( 1.00x)
bgra_to_uv_1080_neon: 176.0 ( 1.53x)
bgra_to_uv_1920_c: 456.6 ( 1.00x)
bgra_to_uv_1920_neon: 307.7 ( 1.48x)
bgra_to_uv_half_1080_c: 193.2 ( 1.00x)
bgra_to_uv_half_1080_neon: 96.8 ( 2.00x)
bgra_to_uv_half_1920_c: 347.2 ( 1.00x)
bgra_to_uv_half_1920_neon: 182.6 ( 1.92x)
With my proprietary test on iOS this gives around a 70% performance
improvement when converting a 1920x1920 BGRA image to yuv420p.

On my Linux ARM Cortex-R processor the improvement is less visible, but
the new code is still consistently 5-10% faster than the current
implementation.
Signed-off-by: Dmitriy Kovalenko <dmtr.kovalenko@outlook.com>
---
libswscale/aarch64/input.S | 166 +++++++++++++++++++++++++------------
1 file changed, 112 insertions(+), 54 deletions(-)
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index c1c0adffc8..ee8eb24c14 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -1,5 +1,4 @@
-/*
- * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
+/* Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
*
* This file is part of FFmpeg.
*
@@ -57,20 +56,41 @@
sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift
.endm
+// interleaved product version of the rgb to yuv gives slightly better performance on non-performant mobile
+.macro rgb_to_uv_interleaved_product r, g, b, u_coef0, u_coef1, u_coef2, v_coef0, v_coef1, v_coef2, u_dst1, u_dst2, v_dst1, v_dst2, u_dst, v_dst, right_shift
+ smlal \u_dst1\().4s, \u_coef0\().4h, \r\().4h // U += ru * r (first 4)
+ smlal \v_dst1\().4s, \v_coef0\().4h, \r\().4h // V += rv * r (first 4)
+ smlal2 \u_dst2\().4s, \u_coef0\().8h, \r\().8h // U += ru * r (second 4)
+ smlal2 \v_dst2\().4s, \v_coef0\().8h, \r\().8h // V += rv * r (second 4)
+
+ smlal \u_dst1\().4s, \u_coef1\().4h, \g\().4h // U += gu * g (first 4)
+ smlal \v_dst1\().4s, \v_coef1\().4h, \g\().4h // V += gv * g (first 4)
+ smlal2 \u_dst2\().4s, \u_coef1\().8h, \g\().8h // U += gu * g (second 4)
+ smlal2 \v_dst2\().4s, \v_coef1\().8h, \g\().8h // V += gv * g (second 4)
+
+ smlal \u_dst1\().4s, \u_coef2\().4h, \b\().4h // U += bu * b (first 4)
+ smlal \v_dst1\().4s, \v_coef2\().4h, \b\().4h // V += bv * b (first 4)
+ smlal2 \u_dst2\().4s, \u_coef2\().8h, \b\().8h // U += bu * b (second 4)
+ smlal2 \v_dst2\().4s, \v_coef2\().8h, \b\().8h // V += bv * b (second 4)
+
+ sqshrn \u_dst\().4h, \u_dst1\().4s, \right_shift // U first 4 pixels
+ sqshrn2 \u_dst\().8h, \u_dst2\().4s, \right_shift // U all 8 pixels
+ sqshrn \v_dst\().4h, \v_dst1\().4s, \right_shift // V first 4 pixels
+ sqshrn2 \v_dst\().8h, \v_dst2\().4s, \right_shift // V all 8 pixels
+.endm
+
.macro rgbToY_neon fmt_bgr, fmt_rgb, element, alpha_first=0
function ff_\fmt_bgr\()ToY_neon, export=1
- cmp w4, #0 // check width > 0
+ cbz w4, 3f // check width > 0
ldp w12, w11, [x5] // w12: ry, w11: gy
ldr w10, [x5, #8] // w10: by
- b.gt 4f
- ret
+ b 4f
endfunc
function ff_\fmt_rgb\()ToY_neon, export=1
- cmp w4, #0 // check width > 0
+ cbz w4, 3f // check width > 0
ldp w10, w11, [x5] // w10: ry, w11: gy
ldr w12, [x5, #8] // w12: by
- b.le 3f
4:
mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
@@ -158,8 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1
.macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
function ff_\fmt_bgr\()ToUV_half_neon, export=1
- cmp w5, #0 // check width > 0
- b.le 3f
+ cbz w5, 3f // check width > 0
ldp w12, w11, [x6, #12]
ldp w10, w15, [x6, #20]
@@ -168,7 +187,7 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1
endfunc
function ff_\fmt_rgb\()ToUV_half_neon, export=1
- cmp w5, #0 // check width > 0
+ cmp w5, #0 // check width > 0
b.le 3f
ldp w10, w11, [x6, #12] // w10: ru, w11: gu
@@ -178,32 +197,41 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
cmp w5, #8
rgb_set_uv_coeff half=1
b.lt 2f
-1:
+1: // load 16 pixels and prefetch memory for the next block
.if \element == 3
- ld3 { v16.16b, v17.16b, v18.16b }, [x3]
+ ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48
+ prfm pldl1strm, [x3, #48]
.else
- ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
+ ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+ prfm pldl1strm, [x3, #64]
.endif
+
.if \alpha_first
- uaddlp v21.8h, v19.16b
- uaddlp v20.8h, v18.16b
- uaddlp v19.8h, v17.16b
+ uaddlp v21.8h, v19.16b // v21: summed b pairs
+ uaddlp v20.8h, v18.16b // v20: summed g pairs
+ uaddlp v19.8h, v17.16b // v19: summed r pairs
.else
- uaddlp v19.8h, v16.16b // v19: r
- uaddlp v20.8h, v17.16b // v20: g
- uaddlp v21.8h, v18.16b // v21: b
+ uaddlp v19.8h, v16.16b // v19: summed r pairs
+ uaddlp v20.8h, v17.16b // v20: summed g pairs
+ uaddlp v21.8h, v18.16b // v21: summed b pairs
.endif
- rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
- rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
- sub w5, w5, #8 // width -= 8
- add x3, x3, #(16*\element)
- cmp w5, #8 // width >= 8 ?
+ mov v22.16b, v6.16b // U first half
+ mov v23.16b, v6.16b // U second half
+ mov v24.16b, v6.16b // V first half
+ mov v25.16b, v6.16b // V second half
+
+ rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
+
str q16, [x0], #16 // store dst_u
str q17, [x1], #16 // store dst_v
+
+ sub w5, w5, #8 // width -= 8
+ cmp w5, #8 // width >= 8 ?
b.ge 1b
- cbz w5, 3f
-2:
+ cbz w5, 3f // No pixels left? Exit
+
+2: // Scalar fallback for remaining pixels
.if \alpha_first
rgb_load_add_half 1, 5, 2, 6, 3, 7
.else
@@ -213,21 +241,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
rgb_load_add_half 0, 4, 1, 5, 2, 6
.endif
.endif
-
smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset
+ smaddl x16, w2, w13, x9 // dst_v = rv * r + const_offset (parallel)
+ smaddl x8, w4, w11, x8 // dst_u += gu * g
+ smaddl x16, w4, w14, x16 // dst_v += gv * g (parallel)
+ smaddl x8, w7, w12, x8 // dst_u += bu * b
- asr x8, x8, #10 // dst_u >>= 10
+ smaddl x16, w7, w15, x16 // dst_v += bv * b (parallel)
+
+ asr w8, w8, #10 // dst_u >>= 10
+ asr w16, w16, #10 // dst_v >>= 10
+ strh w8, [x0], #2 // store dst_u
-
- smaddl x8, w2, w13, x9 // dst_v = rv * r + const_offset
- smaddl x8, w4, w14, x8 // dst_v += gv * g
- smaddl x8, w7, w15, x8 // dst_v += bv * b
- asr x8, x8, #10 // dst_v >>= 10
- sub w5, w5, #1
- add x3, x3, #(2*\element)
- strh w8, [x1], #2 // store dst_v
- cbnz w5, 2b
+ strh w16, [x1], #2 // store dst_v
+
+ sub w5, w5, #1 // width--
+ add x3, x3, #(2*\element) // Advance source pointer
+ cbnz w5, 2b // Process next pixel if any left
3:
ret
endfunc
@@ -244,9 +275,9 @@ function ff_\fmt_bgr\()ToUV_neon, export=1
cmp w5, #0 // check width > 0
b.le 3f
- ldp w12, w11, [x6, #12]
- ldp w10, w15, [x6, #20]
- ldp w14, w13, [x6, #28]
+ ldp w12, w11, [x6, #12] // bu, gu
+ ldp w10, w15, [x6, #20] // ru, bv
+ ldp w14, w13, [x6, #28] // gv, rv
b 4f
endfunc
@@ -263,21 +294,48 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
b.lt 2f
1:
.if \alpha_first
- argb_to_yuv_load_rgb x3
+ ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+ uxtl v21.8h, v19.8b // v21: b
+ uxtl2 v24.8h, v19.16b // v24: b
+ uxtl v19.8h, v17.8b // v19: r
+ uxtl v20.8h, v18.8b // v20: g
+ uxtl2 v22.8h, v17.16b // v22: r
+ uxtl2 v23.8h, v18.16b // v23: g
.else
- rgb_to_yuv_load_rgb x3, \element
+ .if \element == 3
+ ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48
+ prfm pldl1strm, [x3, #48]
+ .else // element == 4
+ ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+ prfm pldl1strm, [x3, #64]
+ .endif
+ uxtl v19.8h, v16.8b // v19: r
+ uxtl v20.8h, v17.8b // v20: g
+ uxtl v21.8h, v18.8b // v21: b
+ uxtl2 v22.8h, v16.16b // v22: r
+ uxtl2 v23.8h, v17.16b // v23: g
+ uxtl2 v24.8h, v18.16b // v24: b
.endif
- rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
- rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
- rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
- rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
- sub w5, w5, #16
- add x3, x3, #(16*\element)
- cmp w5, #16
- stp q16, q17, [x0], #32 // store to dst_u
- stp q18, q19, [x1], #32 // store to dst_v
+ // process 2 groups of 8 pixels
+ mov v25.16b, v6.16b // U_dst1 = const_offset (32-bit accumulators)
+ mov v26.16b, v6.16b // U_dst2 = const_offset
+ mov v27.16b, v6.16b // V_dst1 = const_offset
+ mov v28.16b, v6.16b // V_dst2 = const_offset
+ rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v16, v18, #9
+
+ mov v25.16b, v6.16b
+ mov v26.16b, v6.16b
+ mov v27.16b, v6.16b
+ mov v28.16b, v6.16b
+ rgb_to_uv_interleaved_product v22, v23, v24, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v17, v19, #9
+
+ sub w5, w5, #16 // width -= 16
+ cmp w5, #16 // width >= 16 ?
+ stp q16, q17, [x0], #32 // store to dst_u (post-increment)
+ stp q18, q19, [x1], #32 // store to dst_v (post-increment)
b.ge 1b
- cbz w5, 3f
+ cbz w5, 3f // No pixels left? Exit
+
2:
.if \alpha_first
ldrb w16, [x3, #1] // w16: r
@@ -292,13 +350,13 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
smaddl x8, w16, w10, x9 // x8 = ru * r + const_offset
smaddl x8, w17, w11, x8 // x8 += gu * g
smaddl x8, w4, w12, x8 // x8 += bu * b
- asr w8, w8, #9 // x8 >>= 9
+ asr x8, x8, #9 // x8 >>= 9
strh w8, [x0], #2 // store to dst_u
smaddl x8, w16, w13, x9 // x8 = rv * r + const_offset
smaddl x8, w17, w14, x8 // x8 += gv * g
smaddl x8, w4, w15, x8 // x8 += bv * b
- asr w8, w8, #9 // x8 >>= 9
+ asr x8, x8, #9 // x8 >>= 9
sub w5, w5, #1 // width--
add x3, x3, #\element
strh w8, [x1], #2 // store to dst_v
--
2.39.5 (Apple Git-154)
* Re: [FFmpeg-devel] [PATCH] swscale: rgb_to_yuv neon optimizations
2025-05-19 19:50 [FFmpeg-devel] [PATCH] swscale: rgb_to_yuv neon optimizations Dmitriy Kovalenko
@ 2025-05-20 19:46 ` Michael Niedermayer
0 siblings, 0 replies; 4+ messages in thread
From: Michael Niedermayer @ 2025-05-20 19:46 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Mon, May 19, 2025 at 09:50:18PM +0200, Dmitriy Kovalenko wrote:
> [...]
> +// interleaved product version of the rgb to yuv gives slightly better
> performance on non-performant mobile +.macro rgb_to_uv_interleaved_product
> r, g, b, u_coef0, u_coef1, u_coef2, v_coef0, v_coef1, v_coef2, u_dst1,
> u_dst2, v_dst1, v_dst2, u_dst, v_dst, right_shift
error: corrupt patch at line 58
please make sure your line/word wrap settings don't damage patches
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
During times of universal deceit, telling the truth becomes a
revolutionary act. -- George Orwell
* Re: [FFmpeg-devel] [PATCH] swscale: rgb_to_yuv neon optimizations
2025-05-22 6:55 ` Dmitriy Kovalenko
@ 2025-05-22 10:37 ` Ramiro Polla
0 siblings, 0 replies; 4+ messages in thread
From: Ramiro Polla @ 2025-05-22 10:37 UTC (permalink / raw)
To: FFmpeg development discussions and patches, dmtr.kovalenko
On Thu, May 22, 2025 at 8:55 AM Dmitriy Kovalenko
<dmtr.kovalenko@outlook.com> wrote:
> Bumping on the review for this one
Michael already replied:
https://ffmpeg.org/pipermail/ffmpeg-devel/2025-May/343826.html
The patch is corrupted and can't be applied.
* Re: [FFmpeg-devel] [PATCH] swscale: rgb_to_yuv neon optimizations
[not found] <e10e6bc6-a4d2-4645-822a-5a0225f86353@outlook.com>
@ 2025-05-22 6:55 ` Dmitriy Kovalenko
2025-05-22 10:37 ` Ramiro Polla
0 siblings, 1 reply; 4+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-22 6:55 UTC (permalink / raw)
To: ffmpeg-devel
Bumping on the review for this one
On 19/05/2025 21:50, Dmitriy Kovalenko wrote:
> [...]