* [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations
[not found] <20250531091631.45342-1-dmtr.kovalenko@outlook.com>
@ 2025-05-31 9:11 ` Dmitriy Kovalenko
2025-06-05 12:00 ` Martin Storsjö
2025-05-31 9:11 ` [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time Dmitriy Kovalenko
1 sibling, 1 reply; 8+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-31 9:11 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Dmitriy Kovalenko
I've found quite a few ways to optimize ffmpeg's existing rgb to yuv
subsampled conversion. In this patch stack I'll try to
improve the performance.
This particular set of changes is a small improvement to all the
existing functions and macros. The biggest performance gain comes
from post-incrementing the pointer as part of the load and from
interleaving the multiplication and shifting operations of
different registers for better scheduling. (The immediate prefetching of the memory blocks was moved to the next patch in the stack.)
Also replaced cmp + b.le with a single compare-and-branch instruction (cbz)
in a bunch of places, and made some other small cleanups.
Here are checkasm results on the macbook pro with the latest M4 max
<before>
bgra_to_uv_1080_c: 257.5 ( 1.00x)
bgra_to_uv_1080_neon: 211.9 ( 1.22x)
bgra_to_uv_1920_c: 467.1 ( 1.00x)
bgra_to_uv_1920_neon: 379.3 ( 1.23x)
bgra_to_uv_half_1080_c: 198.9 ( 1.00x)
bgra_to_uv_half_1080_neon: 125.7 ( 1.58x)
bgra_to_uv_half_1920_c: 346.3 ( 1.00x)
bgra_to_uv_half_1920_neon: 223.7 ( 1.55x)
<after>
bgra_to_uv_1080_c: 268.3 ( 1.00x)
bgra_to_uv_1080_neon: 176.0 ( 1.53x)
bgra_to_uv_1920_c: 456.6 ( 1.00x)
bgra_to_uv_1920_neon: 307.7 ( 1.48x)
bgra_to_uv_half_1080_c: 193.2 ( 1.00x)
bgra_to_uv_half_1080_neon: 96.8 ( 2.00x)
bgra_to_uv_half_1920_c: 347.2 ( 1.00x)
bgra_to_uv_half_1920_neon: 182.6 ( 1.92x)
With my proprietary test on iOS it gives around a 70% performance
improvement when converting a bgra 1920x1920 image to yuv420p.
On my Linux ARM Cortex-R processor the performance improvement is not that
visible, but it is still consistently 5-10% faster than the current
implementation.
---
libswscale/aarch64/input.S | 143 +++++++++++++++++++++++--------------
1 file changed, 91 insertions(+), 52 deletions(-)
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index c1c0adffc8..260a26e965 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -22,9 +22,9 @@
.macro rgb_to_yuv_load_rgb src, element=3
.if \element == 3
- ld3 { v16.16b, v17.16b, v18.16b }, [\src]
+ ld3 { v16.16b, v17.16b, v18.16b }, [\src], #48
.else
- ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
+ ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src], #64
.endif
uxtl v19.8h, v16.8b // v19: r
uxtl v20.8h, v17.8b // v20: g
@@ -35,7 +35,7 @@
.endm
.macro argb_to_yuv_load_rgb src
- ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
+ ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src], #64
uxtl v21.8h, v19.8b // v21: b
uxtl2 v24.8h, v19.16b // v24: b
uxtl v19.8h, v17.8b // v19: r
@@ -57,20 +57,41 @@
sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift
.endm
+// interleaved product version of the rgb to yuv gives slightly better performance on non-performant mobile cores
+.macro rgb_to_uv_interleaved_product r, g, b, u_coef0, u_coef1, u_coef2, v_coef0, v_coef1, v_coef2, u_dst1, u_dst2, v_dst1, v_dst2, u_dst, v_dst, right_shift
+ smlal \u_dst1\().4s, \u_coef0\().4h, \r\().4h // U += ru * r (first 4)
+ smlal \v_dst1\().4s, \v_coef0\().4h, \r\().4h // V += rv * r (first 4)
+ smlal2 \u_dst2\().4s, \u_coef0\().8h, \r\().8h // U += ru * r (second 4)
+ smlal2 \v_dst2\().4s, \v_coef0\().8h, \r\().8h // V += rv * r (second 4)
+
+ smlal \u_dst1\().4s, \u_coef1\().4h, \g\().4h // U += gu * g (first 4)
+ smlal \v_dst1\().4s, \v_coef1\().4h, \g\().4h // V += gv * g (first 4)
+ smlal2 \u_dst2\().4s, \u_coef1\().8h, \g\().8h // U += gu * g (second 4)
+ smlal2 \v_dst2\().4s, \v_coef1\().8h, \g\().8h // V += gv * g (second 4)
+
+ smlal \u_dst1\().4s, \u_coef2\().4h, \b\().4h // U += bu * b (first 4)
+ smlal \v_dst1\().4s, \v_coef2\().4h, \b\().4h // V += bv * b (first 4)
+ smlal2 \u_dst2\().4s, \u_coef2\().8h, \b\().8h // U += bu * b (second 4)
+ smlal2 \v_dst2\().4s, \v_coef2\().8h, \b\().8h // V += bv * b (second 4)
+
+ sqshrn \u_dst\().4h, \u_dst1\().4s, \right_shift // U first 4 pixels
+ sqshrn2 \u_dst\().8h, \u_dst2\().4s, \right_shift // U all 8 pixels
+ sqshrn \v_dst\().4h, \v_dst1\().4s, \right_shift // V first 4 pixels
+ sqshrn2 \v_dst\().8h, \v_dst2\().4s, \right_shift // V all 8 pixels
+.endm
+
.macro rgbToY_neon fmt_bgr, fmt_rgb, element, alpha_first=0
function ff_\fmt_bgr\()ToY_neon, export=1
- cmp w4, #0 // check width > 0
+ cbz w4, 3f // check width > 0
ldp w12, w11, [x5] // w12: ry, w11: gy
ldr w10, [x5, #8] // w10: by
- b.gt 4f
- ret
+ b 4f
endfunc
function ff_\fmt_rgb\()ToY_neon, export=1
- cmp w4, #0 // check width > 0
+ cbz w4, 3f // check width > 0
ldp w10, w11, [x5] // w10: ry, w11: gy
ldr w12, [x5, #8] // w12: by
- b.le 3f
4:
mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
@@ -90,7 +111,6 @@ function ff_\fmt_rgb\()ToY_neon, export=1
rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
sub w4, w4, #16 // width -= 16
- add x1, x1, #(16*\element)
cmp w4, #16 // width >= 16 ?
stp q16, q17, [x0], #32 // store to dst
b.ge 1b
@@ -158,8 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1
.macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
function ff_\fmt_bgr\()ToUV_half_neon, export=1
- cmp w5, #0 // check width > 0
- b.le 3f
+ cbz w5, 3f // check width > 0
ldp w12, w11, [x6, #12]
ldp w10, w15, [x6, #20]
@@ -168,7 +187,7 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1
endfunc
function ff_\fmt_rgb\()ToUV_half_neon, export=1
- cmp w5, #0 // check width > 0
+ cmp w5, #0 // check width > 0
b.le 3f
ldp w10, w11, [x6, #12] // w10: ru, w11: gu
@@ -178,32 +197,39 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
cmp w5, #8
rgb_set_uv_coeff half=1
b.lt 2f
-1:
+1: // load 16 pixels
.if \element == 3
- ld3 { v16.16b, v17.16b, v18.16b }, [x3]
+ ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48
.else
- ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
+ ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
.endif
+
.if \alpha_first
- uaddlp v21.8h, v19.16b
- uaddlp v20.8h, v18.16b
- uaddlp v19.8h, v17.16b
+ uaddlp v21.8h, v19.16b // v21: summed b pairs
+ uaddlp v20.8h, v18.16b // v20: summed g pairs
+ uaddlp v19.8h, v17.16b // v19: summed r pairs
.else
- uaddlp v19.8h, v16.16b // v19: r
- uaddlp v20.8h, v17.16b // v20: g
- uaddlp v21.8h, v18.16b // v21: b
+ uaddlp v19.8h, v16.16b // v19: summed r pairs
+ uaddlp v20.8h, v17.16b // v20: summed g pairs
+ uaddlp v21.8h, v18.16b // v21: summed b pairs
.endif
- rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
- rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
- sub w5, w5, #8 // width -= 8
- add x3, x3, #(16*\element)
- cmp w5, #8 // width >= 8 ?
+ mov v22.16b, v6.16b // U first half
+ mov v23.16b, v6.16b // U second half
+ mov v24.16b, v6.16b // V first half
+ mov v25.16b, v6.16b // V second half
+
+ rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
+
str q16, [x0], #16 // store dst_u
str q17, [x1], #16 // store dst_v
+
+ sub w5, w5, #8 // width -= 8
+ cmp w5, #8 // width >= 8 ?
b.ge 1b
- cbz w5, 3f
-2:
+ cbz w5, 3f // No pixels left? Exit
+
+2: // Scalar fallback for remaining pixels
.if \alpha_first
rgb_load_add_half 1, 5, 2, 6, 3, 7
.else
@@ -213,21 +239,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
rgb_load_add_half 0, 4, 1, 5, 2, 6
.endif
.endif
-
smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset
+ smaddl x16, w2, w13, x9 // dst_v = rv * r + const_offset (parallel)
+
smaddl x8, w4, w11, x8 // dst_u += gu * g
+ smaddl x16, w4, w14, x16 // dst_v += gv * g (parallel)
+
smaddl x8, w7, w12, x8 // dst_u += bu * b
- asr x8, x8, #10 // dst_u >>= 10
+ smaddl x16, w7, w15, x16 // dst_v += bv * b (parallel)
+
+ asr w8, w8, #10 // dst_u >>= 10
+ asr w16, w16, #10 // dst_v >>= 10
+
strh w8, [x0], #2 // store dst_u
+ strh w16, [x1], #2 // store dst_v
- smaddl x8, w2, w13, x9 // dst_v = rv * r + const_offset
- smaddl x8, w4, w14, x8 // dst_v += gv * g
- smaddl x8, w7, w15, x8 // dst_v += bv * b
- asr x8, x8, #10 // dst_v >>= 10
- sub w5, w5, #1
- add x3, x3, #(2*\element)
- strh w8, [x1], #2 // store dst_v
- cbnz w5, 2b
+ sub w5, w5, #1 // width--
+ add x3, x3, #(2*\element) // Advance source pointer
+ cbnz w5, 2b // Process next pixel if any left
3:
ret
endfunc
@@ -244,9 +273,9 @@ function ff_\fmt_bgr\()ToUV_neon, export=1
cmp w5, #0 // check width > 0
b.le 3f
- ldp w12, w11, [x6, #12]
- ldp w10, w15, [x6, #20]
- ldp w14, w13, [x6, #28]
+ ldp w12, w11, [x6, #12] // bu, gu
+ ldp w10, w15, [x6, #20] // ru, bv
+ ldp w14, w13, [x6, #28] // gv, rv
b 4f
endfunc
@@ -267,17 +296,26 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
.else
rgb_to_yuv_load_rgb x3, \element
.endif
- rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
- rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
- rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
- rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
- sub w5, w5, #16
- add x3, x3, #(16*\element)
- cmp w5, #16
- stp q16, q17, [x0], #32 // store to dst_u
- stp q18, q19, [x1], #32 // store to dst_v
+ // process 2 groups of 8 pixels
+ mov v25.16b, v6.16b // U_dst1 = const_offset (32-bit accumulators)
+ mov v26.16b, v6.16b // U_dst2 = const_offset
+ mov v27.16b, v6.16b // V_dst1 = const_offset
+ mov v28.16b, v6.16b // V_dst2 = const_offset
+ rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v16, v18, #9
+
+ mov v25.16b, v6.16b
+ mov v26.16b, v6.16b
+ mov v27.16b, v6.16b
+ mov v28.16b, v6.16b
+ rgb_to_uv_interleaved_product v22, v23, v24, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v17, v19, #9
+
+ sub w5, w5, #16 // width -= 16
+ cmp w5, #16 // width >= 16 ?
+ stp q16, q17, [x0], #32 // store to dst_u (post-increment)
+ stp q18, q19, [x1], #32 // store to dst_v (post-increment)
b.ge 1b
- cbz w5, 3f
+ cbz w5, 3f // No pixels left? Exit
+
2:
.if \alpha_first
ldrb w16, [x3, #1] // w16: r
@@ -292,7 +330,7 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
smaddl x8, w16, w10, x9 // x8 = ru * r + const_offset
smaddl x8, w17, w11, x8 // x8 += gu * g
smaddl x8, w4, w12, x8 // x8 += bu * b
- asr w8, w8, #9 // x8 >>= 9
+ asr x8, x8, #9 // x8 >>= 9
strh w8, [x0], #2 // store to dst_u
smaddl x8, w16, w13, x9 // x8 = rv * r + const_offset
@@ -401,3 +439,4 @@ endfunc
DISABLE_DOTPROD
#endif
+
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time
[not found] <20250531091631.45342-1-dmtr.kovalenko@outlook.com>
2025-05-31 9:11 ` [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations Dmitriy Kovalenko
@ 2025-05-31 9:11 ` Dmitriy Kovalenko
2025-05-31 10:32 ` Kieran Kunhya via ffmpeg-devel
2025-06-05 12:13 ` Martin Storsjö
1 sibling, 2 replies; 8+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-31 9:11 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Dmitriy Kovalenko
This patch integrates so-called double buffering, where we load
2 batches of elements at a time and then process them in parallel. On
modern ARM processors, especially Apple Silicon, it gives a visible
benefit; for subsampled pixel processing it is especially nice because
it allows reading elements with 2 instructions and writing with a single one
(especially visible on platforms with slower memory, like iOS).
Together with the previous patch in the stack, rgb_to_yuv_half in checkasm
on a MacBook Pro M4 Max reaches about 2x the speed of the C version.
---
libswscale/aarch64/input.S | 130 ++++++++++++++++++++++++++-----------
1 file changed, 91 insertions(+), 39 deletions(-)
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index 260a26e965..b90ca05996 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -178,7 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1
.macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
function ff_\fmt_bgr\()ToUV_half_neon, export=1
- cbz w5, 3f // check width > 0
+ cbz w5, 3f
ldp w12, w11, [x6, #12]
ldp w10, w15, [x6, #20]
@@ -187,49 +187,101 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1
endfunc
function ff_\fmt_rgb\()ToUV_half_neon, export=1
- cmp w5, #0 // check width > 0
+ cmp w5, #0
b.le 3f
- ldp w10, w11, [x6, #12] // w10: ru, w11: gu
- ldp w12, w13, [x6, #20] // w12: bu, w13: rv
- ldp w14, w15, [x6, #28] // w14: gv, w15: bv
+ ldp w10, w11, [x6, #12]
+ ldp w12, w13, [x6, #20]
+ ldp w14, w15, [x6, #28]
4:
- cmp w5, #8
rgb_set_uv_coeff half=1
+
+ cmp w5, #16
b.lt 2f
-1: // load 16 pixels
+
+1:
.if \element == 3
ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48
+ ld3 { v26.16b, v27.16b, v28.16b }, [x3], #48
.else
ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+ ld4 { v26.16b, v27.16b, v28.16b, v29.16b }, [x3], #64
.endif
.if \alpha_first
- uaddlp v21.8h, v19.16b // v21: summed b pairs
- uaddlp v20.8h, v18.16b // v20: summed g pairs
- uaddlp v19.8h, v17.16b // v19: summed r pairs
+ uaddlp v21.8h, v19.16b
+ uaddlp v20.8h, v18.16b
+ uaddlp v19.8h, v17.16b
+ uaddlp v31.8h, v29.16b
+ uaddlp v30.8h, v28.16b
+ uaddlp v29.8h, v27.16b
.else
- uaddlp v19.8h, v16.16b // v19: summed r pairs
- uaddlp v20.8h, v17.16b // v20: summed g pairs
- uaddlp v21.8h, v18.16b // v21: summed b pairs
+ uaddlp v19.8h, v16.16b
+ uaddlp v20.8h, v17.16b
+ uaddlp v21.8h, v18.16b
+ uaddlp v29.8h, v26.16b
+ uaddlp v30.8h, v27.16b
+ uaddlp v31.8h, v28.16b
.endif
- mov v22.16b, v6.16b // U first half
- mov v23.16b, v6.16b // U second half
- mov v24.16b, v6.16b // V first half
- mov v25.16b, v6.16b // V second half
-
- rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
-
- str q16, [x0], #16 // store dst_u
- str q17, [x1], #16 // store dst_v
+ mov v7.16b, v6.16b
+ mov v16.16b, v6.16b
+ mov v17.16b, v6.16b
+ mov v18.16b, v6.16b
+ mov v26.16b, v6.16b
+ mov v27.16b, v6.16b
+ mov v28.16b, v6.16b
+ mov v25.16b, v6.16b
- sub w5, w5, #8 // width -= 8
- cmp w5, #8 // width >= 8 ?
+ smlal v7.4s, v0.4h, v19.4h
+ smlal v17.4s, v3.4h, v19.4h
+ smlal v26.4s, v0.4h, v29.4h
+ smlal v28.4s, v3.4h, v29.4h
+
+ smlal2 v16.4s, v0.8h, v19.8h
+ smlal2 v18.4s, v3.8h, v19.8h
+ smlal2 v27.4s, v0.8h, v29.8h
+ smlal2 v25.4s, v3.8h, v29.8h
+
+ smlal v7.4s, v1.4h, v20.4h
+ smlal v17.4s, v4.4h, v20.4h
+ smlal v26.4s, v1.4h, v30.4h
+ smlal v28.4s, v4.4h, v30.4h
+
+ smlal2 v16.4s, v1.8h, v20.8h
+ smlal2 v18.4s, v4.8h, v20.8h
+ smlal2 v27.4s, v1.8h, v30.8h
+ smlal2 v25.4s, v4.8h, v30.8h
+
+ smlal v7.4s, v2.4h, v21.4h
+ smlal v17.4s, v5.4h, v21.4h
+ smlal v26.4s, v2.4h, v31.4h
+ smlal v28.4s, v5.4h, v31.4h
+
+ smlal2 v16.4s, v2.8h, v21.8h
+ smlal2 v18.4s, v5.8h, v21.8h
+ smlal2 v27.4s, v2.8h, v31.8h
+ smlal2 v25.4s, v5.8h, v31.8h
+
+ sqshrn v19.4h, v7.4s, #10
+ sqshrn v20.4h, v17.4s, #10
+ sqshrn v22.4h, v26.4s, #10
+ sqshrn v23.4h, v28.4s, #10
+
+ sqshrn2 v19.8h, v16.4s, #10
+ sqshrn2 v20.8h, v18.4s, #10
+ sqshrn2 v22.8h, v27.4s, #10
+ sqshrn2 v23.8h, v25.4s, #10
+
+ stp q19, q22, [x0], #32
+ stp q20, q23, [x1], #32
+
+ sub w5, w5, #16
+ cmp w5, #16
b.ge 1b
- cbz w5, 3f // No pixels left? Exit
+ cbz w5, 3f
-2: // Scalar fallback for remaining pixels
+2:
.if \alpha_first
rgb_load_add_half 1, 5, 2, 6, 3, 7
.else
@@ -239,24 +291,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
rgb_load_add_half 0, 4, 1, 5, 2, 6
.endif
.endif
- smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset
- smaddl x16, w2, w13, x9 // dst_v = rv * r + const_offset (parallel)
+ smaddl x8, w2, w10, x9
+ smaddl x16, w2, w13, x9
- smaddl x8, w4, w11, x8 // dst_u += gu * g
- smaddl x16, w4, w14, x16 // dst_v += gv * g (parallel)
+ smaddl x8, w4, w11, x8
+ smaddl x16, w4, w14, x16
- smaddl x8, w7, w12, x8 // dst_u += bu * b
- smaddl x16, w7, w15, x16 // dst_v += bv * b (parallel)
+ smaddl x8, w7, w12, x8
+ smaddl x16, w7, w15, x16
- asr w8, w8, #10 // dst_u >>= 10
- asr w16, w16, #10 // dst_v >>= 10
+ asr w8, w8, #10
+ asr w16, w16, #10
- strh w8, [x0], #2 // store dst_u
- strh w16, [x1], #2 // store dst_v
+ strh w8, [x0], #2
+ strh w16, [x1], #2
- sub w5, w5, #1 // width--
- add x3, x3, #(2*\element) // Advance source pointer
- cbnz w5, 2b // Process next pixel if any left
+ sub w5, w5, #1
+ add x3, x3, #(2*\element)
+ cbnz w5, 2b
3:
ret
endfunc
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time
2025-05-31 9:11 ` [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time Dmitriy Kovalenko
@ 2025-05-31 10:32 ` Kieran Kunhya via ffmpeg-devel
2025-05-31 10:43 ` Dmitriy Kovalenko
2025-06-05 12:13 ` Martin Storsjö
1 sibling, 1 reply; 8+ messages in thread
From: Kieran Kunhya via ffmpeg-devel @ 2025-05-31 10:32 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Cc: Kieran Kunhya, Dmitriy Kovalenko
On Sat, 31 May 2025, 10:17 Dmitriy Kovalenko, <dmtr.kovalenko@outlook.com>
wrote:
> This patch integrates so called double bufferring when we are loading
>
Nit: I am not sure this is what most people refer to as "double buffering"
Kieran
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time
2025-05-31 10:32 ` Kieran Kunhya via ffmpeg-devel
@ 2025-05-31 10:43 ` Dmitriy Kovalenko
2025-05-31 12:13 ` Martin Storsjö
0 siblings, 1 reply; 8+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-31 10:43 UTC (permalink / raw)
To: Kieran Kunhya; +Cc: ffmpeg-devel
Correct. I meant dual issue https://developer.arm.com/documentation/ddi0460/d/Cycle-Timings-and-Interlock-Behavior/Dual-issue
Best regards,
Dmitriy Kovalenko
On May 31, 2025, at 12:32, Kieran Kunhya <kieran618@googlemail.com> wrote:
On Sat, 31 May 2025, 10:17 Dmitriy Kovalenko, <dmtr.kovalenko@outlook.com<mailto:dmtr.kovalenko@outlook.com>> wrote:
This patch integrates so called double bufferring when we are loading
Nit: I am not sure this is what most people refer to as "double buffering"
Kieran
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time
2025-05-31 10:43 ` Dmitriy Kovalenko
@ 2025-05-31 12:13 ` Martin Storsjö
2025-05-31 12:21 ` Dmitriy Kovalenko
0 siblings, 1 reply; 8+ messages in thread
From: Martin Storsjö @ 2025-05-31 12:13 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Kieran Kunhya
On Sat, 31 May 2025, Dmitriy Kovalenko wrote:
> Correct. I meant dual issue https://developer.arm.com/documentation/ddi0460/d/Cycle-Timings-and-Interlock-Behavior/Dual-issue
Do not top post.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time
2025-05-31 12:13 ` Martin Storsjö
@ 2025-05-31 12:21 ` Dmitriy Kovalenko
0 siblings, 0 replies; 8+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-31 12:21 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: ffmpeg-devel, Kieran Kunhya
> On May 31, 2025, at 14:13, Martin Storsjö <martin@martin.st> wrote:
>
> On Sat, 31 May 2025, Dmitriy Kovalenko wrote:
>
>> Correct. I meant dual issue https://developer.arm.com/documentation/ddi0460/d/Cycle-Timings-and-Interlock-Behavior/Dual-issue
>
> Do not top post.
>
> // Martin
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Got it 💀 and one last time sorry
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations
2025-05-31 9:11 ` [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations Dmitriy Kovalenko
@ 2025-06-05 12:00 ` Martin Storsjö
0 siblings, 0 replies; 8+ messages in thread
From: Martin Storsjö @ 2025-06-05 12:00 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Dmitriy Kovalenko
On Sat, 31 May 2025, Dmitriy Kovalenko wrote:
> I've found quite a few ways to optimize existing ffmpeg's rgb to yuv
> subsampled conversion. In this patch stack I'll try to
> improve the perofrmance.
>
> This particular set of changes is a small improvement to all the
> existing functions and macro. The biggest performance gain is
> coming from post loading increment of the pointer and immediate
> ~~prefetching of the memory blocks~~(was moved to the next patch in the stack) and interleaving the multiplication shifting operations of
> different registers for better scheduling.
Why keep the mention of prefetching here, when it no longer is included in
the patch at all? This is what you suggest is encoded as the final,
immutable commit message describing this change.
I have further inline comments below, please read them all.
> Also changed a bunch of places where cmp + b.le was used instead
> of one instruction cbnz/tbnz and some other small cleanups.
>
> Here are checkasm results on the macbook pro with the latest M4 max
>
> <before>
>
> bgra_to_uv_1080_c: 257.5 ( 1.00x)
> bgra_to_uv_1080_neon: 211.9 ( 1.22x)
> bgra_to_uv_1920_c: 467.1 ( 1.00x)
> bgra_to_uv_1920_neon: 379.3 ( 1.23x)
> bgra_to_uv_half_1080_c: 198.9 ( 1.00x)
> bgra_to_uv_half_1080_neon: 125.7 ( 1.58x)
> bgra_to_uv_half_1920_c: 346.3 ( 1.00x)
> bgra_to_uv_half_1920_neon: 223.7 ( 1.55x)
>
> <after>
>
> bgra_to_uv_1080_c: 268.3 ( 1.00x)
> bgra_to_uv_1080_neon: 176.0 ( 1.53x)
> bgra_to_uv_1920_c: 456.6 ( 1.00x)
> bgra_to_uv_1920_neon: 307.7 ( 1.48x)
> bgra_to_uv_half_1080_c: 193.2 ( 1.00x)
> bgra_to_uv_half_1080_neon: 96.8 ( 2.00x)
> bgra_to_uv_half_1920_c: 347.2 ( 1.00x)
> bgra_to_uv_half_1920_neon: 182.6 ( 1.92x)
>
> With my proprietary test on IOS it gives around 70% of performance
> improvement converting bgra 1920x1920 image to yuv420p
>
> On my linux arm cortex-r processing the performance improvement not that
> visible but still consistently faster by 5-10% than the current
> implementation.
> ---
> libswscale/aarch64/input.S | 143 +++++++++++++++++++++++--------------
> 1 file changed, 91 insertions(+), 52 deletions(-)
> @@ -292,7 +330,7 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
> smaddl x8, w16, w10, x9 // x8 = ru * r + const_offset
> smaddl x8, w17, w11, x8 // x8 += gu * g
> smaddl x8, w4, w12, x8 // x8 += bu * b
> - asr w8, w8, #9 // x8 >>= 9
> + asr x8, x8, #9 // x8 >>= 9
> strh w8, [x0], #2 // store to dst_u
>
Here you _still_ have one instance of these unrelated changes left in your
patch.
> smaddl x8, w16, w13, x9 // x8 = rv * r + const_offset
> @@ -401,3 +439,4 @@ endfunc
>
> DISABLE_DOTPROD
> #endif
> +
> --
Here you are adding one unrelated empty line at the end of the file. Don't
include any unrelated changes in your patches.
Before sending a patch, do review it yourself first, checking for any such
unrelated stray changes.
Other than those details, the rest of the patch looks ok.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time
2025-05-31 9:11 ` [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time Dmitriy Kovalenko
2025-05-31 10:32 ` Kieran Kunhya via ffmpeg-devel
@ 2025-06-05 12:13 ` Martin Storsjö
1 sibling, 0 replies; 8+ messages in thread
From: Martin Storsjö @ 2025-06-05 12:13 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Dmitriy Kovalenko
On Sat, 31 May 2025, Dmitriy Kovalenko wrote:
> This patch integrates so called double bufferring when we are loading
> 2 batch of elements at a time and then processing them in parallel. On the
> moden arm processors especially Apple Silicon it gives a visible
> benefit, for subsampled pixel processing it is especially nice because
> it allows to read elements w/ 2 instructions and write with a single one
> (especially visible on a platforms with slower memory like ios).
>
> Including the previous patch in a stack on macbook pro m4 max rgb_to_yuv_half
> in checkasm goes up 2x of the c version
Please quote actual checkasm numbers, which shows exactly which tests were
run and their numbers, before/after the change.
> ---
> libswscale/aarch64/input.S | 130 ++++++++++++++++++++++++++-----------
> 1 file changed, 91 insertions(+), 39 deletions(-)
>
> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> index 260a26e965..b90ca05996 100644
> --- a/libswscale/aarch64/input.S
> +++ b/libswscale/aarch64/input.S
> @@ -178,7 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1
>
> .macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
> function ff_\fmt_bgr\()ToUV_half_neon, export=1
> - cbz w5, 3f // check width > 0
> + cbz w5, 3f
Unrelated change, you're just removing a comment. Don't intermix unrelated
changes in a patch.
> ldp w12, w11, [x6, #12]
> ldp w10, w15, [x6, #20]
> @@ -187,49 +187,101 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1
> endfunc
>
> function ff_\fmt_rgb\()ToUV_half_neon, export=1
> - cmp w5, #0 // check width > 0
> + cmp w5, #0
> b.le 3f
Unrelated change; only removing a comment.
>
> - ldp w10, w11, [x6, #12] // w10: ru, w11: gu
> - ldp w12, w13, [x6, #20] // w12: bu, w13: rv
> - ldp w14, w15, [x6, #28] // w14: gv, w15: bv
> + ldp w10, w11, [x6, #12]
> + ldp w12, w13, [x6, #20]
> + ldp w14, w15, [x6, #28]
Unrelated change, removing comments.
> 4:
> - cmp w5, #8
> rgb_set_uv_coeff half=1
> +
> + cmp w5, #16
> b.lt 2f
Any specific reason for moving the cmp instruction to after the
rgb_set_uv_coeff macro? By having it directly before the b.lt instruction
that depends on the cmp instruction, you're introducing stalls on in-order
cores.
> -1: // load 16 pixels
> +
> +1:
> .if \element == 3
> ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48
> + ld3 { v26.16b, v27.16b, v28.16b }, [x3], #48
> .else
> ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
> + ld4 { v26.16b, v27.16b, v28.16b, v29.16b }, [x3], #64
> .endif
>
> .if \alpha_first
> - uaddlp v21.8h, v19.16b // v21: summed b pairs
> - uaddlp v20.8h, v18.16b // v20: summed g pairs
> - uaddlp v19.8h, v17.16b // v19: summed r pairs
> + uaddlp v21.8h, v19.16b
> + uaddlp v20.8h, v18.16b
> + uaddlp v19.8h, v17.16b
> + uaddlp v31.8h, v29.16b
> + uaddlp v30.8h, v28.16b
> + uaddlp v29.8h, v27.16b
Here, you're removing comments that you yourself added in the preceding
commit. Don't do that; if you don't want the comments there, don't add
them in the first commit.
With that in mind, you could entirely drop that change in the first commit
anyway, there's no need to touch those lines there as you're only adding
comments anyway.
> .else
> - uaddlp v19.8h, v16.16b // v19: summed r pairs
> - uaddlp v20.8h, v17.16b // v20: summed g pairs
> - uaddlp v21.8h, v18.16b // v21: summed b pairs
> + uaddlp v19.8h, v16.16b
> + uaddlp v20.8h, v17.16b
> + uaddlp v21.8h, v18.16b
> + uaddlp v29.8h, v26.16b
> + uaddlp v30.8h, v27.16b
> + uaddlp v31.8h, v28.16b
> .endif
>
> - mov v22.16b, v6.16b // U first half
> - mov v23.16b, v6.16b // U second half
> - mov v24.16b, v6.16b // V first half
> - mov v25.16b, v6.16b // V second half
> -
> - rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
> -
> - str q16, [x0], #16 // store dst_u
> - str q17, [x1], #16 // store dst_v
> + mov v7.16b, v6.16b
> + mov v16.16b, v6.16b
> + mov v17.16b, v6.16b
> + mov v18.16b, v6.16b
> + mov v26.16b, v6.16b
> + mov v27.16b, v6.16b
> + mov v28.16b, v6.16b
> + mov v25.16b, v6.16b
>
> - sub w5, w5, #8 // width -= 8
> - cmp w5, #8 // width >= 8 ?
> + smlal v7.4s, v0.4h, v19.4h
> + smlal v17.4s, v3.4h, v19.4h
> + smlal v26.4s, v0.4h, v29.4h
> + smlal v28.4s, v3.4h, v29.4h
Here you deinline the macro. Is there a significant performance benefit of
doing that, over just calling the macro twice? The long commentless
smlal/smlal2 sequence here feels less readable than what we had before.
> +
> + smlal2 v16.4s, v0.8h, v19.8h
> + smlal2 v18.4s, v3.8h, v19.8h
> + smlal2 v27.4s, v0.8h, v29.8h
> + smlal2 v25.4s, v3.8h, v29.8h
> +
> + smlal v7.4s, v1.4h, v20.4h
> + smlal v17.4s, v4.4h, v20.4h
> + smlal v26.4s, v1.4h, v30.4h
> + smlal v28.4s, v4.4h, v30.4h
> +
> + smlal2 v16.4s, v1.8h, v20.8h
> + smlal2 v18.4s, v4.8h, v20.8h
> + smlal2 v27.4s, v1.8h, v30.8h
> + smlal2 v25.4s, v4.8h, v30.8h
> +
> + smlal v7.4s, v2.4h, v21.4h
> + smlal v17.4s, v5.4h, v21.4h
> + smlal v26.4s, v2.4h, v31.4h
> + smlal v28.4s, v5.4h, v31.4h
> +
> + smlal2 v16.4s, v2.8h, v21.8h
> + smlal2 v18.4s, v5.8h, v21.8h
> + smlal2 v27.4s, v2.8h, v31.8h
> + smlal2 v25.4s, v5.8h, v31.8h
> +
> + sqshrn v19.4h, v7.4s, #10
> + sqshrn v20.4h, v17.4s, #10
> + sqshrn v22.4h, v26.4s, #10
> + sqshrn v23.4h, v28.4s, #10
> +
> + sqshrn2 v19.8h, v16.4s, #10
> + sqshrn2 v20.8h, v18.4s, #10
> + sqshrn2 v22.8h, v27.4s, #10
> + sqshrn2 v23.8h, v25.4s, #10
> +
> + stp q19, q22, [x0], #32
> + stp q20, q23, [x1], #32
> +
> + sub w5, w5, #16
> + cmp w5, #16
> b.ge 1b
> - cbz w5, 3f // No pixels left? Exit
> + cbz w5, 3f
>
> -2: // Scalar fallback for remaining pixels
> +2:
> .if \alpha_first
> rgb_load_add_half 1, 5, 2, 6, 3, 7
> .else
> @@ -239,24 +291,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
> rgb_load_add_half 0, 4, 1, 5, 2, 6
> .endif
> .endif
> - smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset
> - smaddl x16, w2, w13, x9 // dst_v = rv * r + const_offset (parallel)
> + smaddl x8, w2, w10, x9
> + smaddl x16, w2, w13, x9
Unrelated change.
>
> - smaddl x8, w4, w11, x8 // dst_u += gu * g
> - smaddl x16, w4, w14, x16 // dst_v += gv * g (parallel)
> + smaddl x8, w4, w11, x8
> + smaddl x16, w4, w14, x16
Unrelated change.
>
> - smaddl x8, w7, w12, x8 // dst_u += bu * b
> - smaddl x16, w7, w15, x16 // dst_v += bv * b (parallel)
> + smaddl x8, w7, w12, x8
> + smaddl x16, w7, w15, x16
Unrelated change.
>
> - asr w8, w8, #10 // dst_u >>= 10
> - asr w16, w16, #10 // dst_v >>= 10
> + asr w8, w8, #10
> + asr w16, w16, #10
Unrelated change.
>
> - strh w8, [x0], #2 // store dst_u
> - strh w16, [x1], #2 // store dst_v
> + strh w8, [x0], #2
> + strh w16, [x1], #2
Unrelated change.
>
> - sub w5, w5, #1 // width--
> - add x3, x3, #(2*\element) // Advance source pointer
> - cbnz w5, 2b // Process next pixel if any left
> + sub w5, w5, #1
> + add x3, x3, #(2*\element)
> + cbnz w5, 2b
Unrelated change.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2025-06-05 12:13 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <20250531091631.45342-1-dmtr.kovalenko@outlook.com>
2025-05-31 9:11 ` [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations Dmitriy Kovalenko
2025-06-05 12:00 ` Martin Storsjö
2025-05-31 9:11 ` [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time Dmitriy Kovalenko
2025-05-31 10:32 ` Kieran Kunhya via ffmpeg-devel
2025-05-31 10:43 ` Dmitriy Kovalenko
2025-05-31 12:13 ` Martin Storsjö
2025-05-31 12:21 ` Dmitriy Kovalenko
2025-06-05 12:13 ` Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git