* [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time
[not found] <20250527165800.17159-1-dmtr.kovalenko@outlook.com>
@ 2025-05-27 16:58 ` Dmitriy Kovalenko
2025-05-29 19:09 ` Martin Storsjö
0 siblings, 1 reply; 3+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-27 16:58 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Dmitriy Kovalenko
This patch integrates so-called double buffering: we load two batches of
pixels at a time and then process them in parallel. On modern ARM
processors, especially Apple Silicon, this gives a visible benefit; it is
especially nice for subsampled pixel processing, because it lets us read
the input with two instructions and write it with a single one (the
write usually being the slowest part).

With the previous patch of the stack included, on a MacBook Pro M4 Max
rgb_to_yuv_half in checkasm goes up to 2x the speed of the C version.
---
libswscale/aarch64/input.S | 332 ++++++++++++++++++++++++++++++++++---
1 file changed, 309 insertions(+), 23 deletions(-)
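
For reviewers, the overall loop shape this patch aims for is roughly the
following sketch (register choices illustrative, the multiply-accumulate
body elided; this is not the exact code from the diff below):

    1:  ld4     { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64  // pixels 0-15
        ld4     { v26.16b, v27.16b, v28.16b, v29.16b }, [x3], #64  // pixels 16-31
        // ... widen, multiply-accumulate and narrow both blocks ...
        stp     q16, q22, [x0], #32     // 16 U outputs in one store
        stp     q17, q23, [x1], #32     // 16 V outputs in one store
        sub     w5, w5, #16             // width -= 16
        cmp     w5, #16
        b.ge    1b                      // loop while >= 16 pixels remain
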
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index ee8eb24c14..59d66d0022 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -194,40 +194,94 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
ldp w12, w13, [x6, #20] // w12: bu, w13: rv
ldp w14, w15, [x6, #28] // w14: gv, w15: bv
4:
- cmp w5, #8
rgb_set_uv_coeff half=1
- b.lt 2f
-1: // load 16 pixels and prefetch memory for the next block
+
+ cmp w5, #16
+ b.lt 2f // Go directly to scalar if < 16
+
+1:
.if \element == 3
- ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48
- prfm pldl1strm, [x3, #48]
+ ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48 // First 16 pixels
+ ld3 { v26.16b, v27.16b, v28.16b }, [x3], #48 // Second 16 pixels
+ prfm pldl1keep, [x3, #96]
.else
- ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
- prfm pldl1strm, [x3, #64]
+ ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64 // First 16 pixels
+ ld4 { v26.16b, v27.16b, v28.16b, v29.16b }, [x3], #64 // Second 16 pixels
+ prfm pldl1keep, [x3, #128]
.endif
+ // Sum adjacent pixel pairs
.if \alpha_first
- uaddlp v21.8h, v19.16b // v21: summed b pairs
- uaddlp v20.8h, v18.16b // v20: summed g pairs
- uaddlp v19.8h, v17.16b // v19: summed r pairs
+ uaddlp v21.8h, v19.16b // Block 1: B sums
+ uaddlp v20.8h, v18.16b // Block 1: G sums
+ uaddlp v19.8h, v17.16b // Block 1: R sums
+ uaddlp v31.8h, v29.16b // Block 2: B sums
+ uaddlp v30.8h, v28.16b // Block 2: G sums
+ uaddlp v29.8h, v27.16b // Block 2: R sums
.else
- uaddlp v19.8h, v16.16b // v19: summed r pairs
- uaddlp v20.8h, v17.16b // v20: summed g pairs
- uaddlp v21.8h, v18.16b // v21: summed b pairs
+ uaddlp v19.8h, v16.16b // Block 1: R sums
+ uaddlp v20.8h, v17.16b // Block 1: G sums
+ uaddlp v21.8h, v18.16b // Block 1: B sums
+ uaddlp v29.8h, v26.16b // Block 2: R sums
+ uaddlp v30.8h, v27.16b // Block 2: G sums
+ uaddlp v31.8h, v28.16b // Block 2: B sums
.endif
- mov v22.16b, v6.16b // U first half
- mov v23.16b, v6.16b // U second half
- mov v24.16b, v6.16b // V first half
- mov v25.16b, v6.16b // V second half
-
- rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
+ // init accumulators for both blocks
+ mov v7.16b, v6.16b // Block 1: U_low
+ mov v8.16b, v6.16b // Block 1: U_high
+ mov v9.16b, v6.16b // Block 1: V_low
+ mov v10.16b, v6.16b // Block 1: V_high
+ mov v11.16b, v6.16b // Block 2: U_low
+ mov v12.16b, v6.16b // Block 2: U_high
+ mov v13.16b, v6.16b // Block 2: V_low
+ mov v14.16b, v6.16b // Block 2: V_high
+
+ smlal v7.4s, v0.4h, v19.4h // U1 += ru * r (0-3)
+ smlal v9.4s, v3.4h, v19.4h // V1 += rv * r (0-3)
+ smlal v11.4s, v0.4h, v29.4h // U2 += ru * r (0-3)
+ smlal v13.4s, v3.4h, v29.4h // V2 += rv * r (0-3)
+
+ smlal2 v8.4s, v0.8h, v19.8h // U1 += ru * r (4-7)
+ smlal2 v10.4s, v3.8h, v19.8h // V1 += rv * r (4-7)
+ smlal2 v12.4s, v0.8h, v29.8h // U2 += ru * r (4-7)
+ smlal2 v14.4s, v3.8h, v29.8h // V2 += rv * r (4-7)
+
+ smlal v7.4s, v1.4h, v20.4h // U1 += gu * g (0-3)
+ smlal v9.4s, v4.4h, v20.4h // V1 += gv * g (0-3)
+ smlal v11.4s, v1.4h, v30.4h // U2 += gu * g (0-3)
+ smlal v13.4s, v4.4h, v30.4h // V2 += gv * g (0-3)
+
+ smlal2 v8.4s, v1.8h, v20.8h // U1 += gu * g (4-7)
+ smlal2 v10.4s, v4.8h, v20.8h // V1 += gv * g (4-7)
+ smlal2 v12.4s, v1.8h, v30.8h // U2 += gu * g (4-7)
+ smlal2 v14.4s, v4.8h, v30.8h // V2 += gv * g (4-7)
+
+ smlal v7.4s, v2.4h, v21.4h // U1 += bu * b (0-3)
+ smlal v9.4s, v5.4h, v21.4h // V1 += bv * b (0-3)
+ smlal v11.4s, v2.4h, v31.4h // U2 += bu * b (0-3)
+ smlal v13.4s, v5.4h, v31.4h // V2 += bv * b (0-3)
+
+ smlal2 v8.4s, v2.8h, v21.8h // U1 += bu * b (4-7)
+ smlal2 v10.4s, v5.8h, v21.8h // V1 += bv * b (4-7)
+ smlal2 v12.4s, v2.8h, v31.8h // U2 += bu * b (4-7)
+ smlal2 v14.4s, v5.8h, v31.8h // V2 += bv * b (4-7)
+
+ sqshrn v16.4h, v7.4s, #10 // U1 (0-3)
+ sqshrn v17.4h, v9.4s, #10 // V1 (0-3)
+ sqshrn v22.4h, v11.4s, #10 // U2 (0-3)
+ sqshrn v23.4h, v13.4s, #10 // V2 (0-3)
+
+ sqshrn2 v16.8h, v8.4s, #10 // U1 (0-7)
+ sqshrn2 v17.8h, v10.4s, #10 // V1 (0-7)
+ sqshrn2 v22.8h, v12.4s, #10 // U2 (0-7)
+ sqshrn2 v23.8h, v14.4s, #10 // V2 (0-7)
- str q16, [x0], #16 // store dst_u
- str q17, [x1], #16 // store dst_v
+ stp q16, q22, [x0], #32 // Store all 16 U values
+ stp q17, q23, [x1], #32 // Store all 16 V values
- sub w5, w5, #8 // width -= 8
- cmp w5, #8 // width >= 8 ?
+ sub w5, w5, #16 // width -= 16
+ cmp w5, #16 // width >= 16 ?
b.ge 1b
cbz w5, 3f // No pixels left? Exit
@@ -459,3 +513,235 @@ endfunc
DISABLE_DOTPROD
#endif
+
+.macro rgbToUV_half_neon_double fmt_bgr, fmt_rgb, element, alpha_first=0
+function ff_\fmt_bgr\()ToUV_half_neon_double, export=1
+ cbz w5, 9f // exit immediately if width is 0
+ cmp w5, #16 // check if we have at least 16 pixels
+ b.lt _ff_\fmt_bgr\()ToUV_half_neon
+
+ ldp w12, w11, [x6, #12] // w12: bu, w11: gu
+ ldp w10, w15, [x6, #20] // w10: ru, w15: bv
+ ldp w14, w13, [x6, #28] // w14: gv, w13: rv
+
+ b .Lcommon_uv_processing_\fmt_bgr
+endfunc
+
+function ff_\fmt_rgb\()ToUV_half_neon_double, export=1
+ cbz w5, 9f // exit immediately if width is 0
+ cmp w5, #16 // check if we have at least 16 pixels
+ b.lt _ff_\fmt_rgb\()ToUV_half_neon
+
+ ldp w10, w11, [x6, #12] // w10: ru, w11: gu
+ ldp w12, w13, [x6, #20] // w12: bu, w13: rv
+ ldp w14, w15, [x6, #28] // w14: gv, w15: bv
+
+.Lcommon_uv_processing_\fmt_bgr:
+ rgb_set_uv_coeff half=1
+
+ zip1 v7.8h, v0.8h, v3.8h // [ru0, rv0, ru1, rv1, ru2, rv2, ru3, rv3]
+ zip2 v8.8h, v0.8h, v3.8h // [ru4, rv4, ru5, rv5, ru6, rv6, ru7, rv7]
+ zip1 v9.8h, v1.8h, v4.8h // [gu0, gv0, gu1, gv1, gu2, gv2, gu3, gv3]
+ zip2 v10.8h, v1.8h, v4.8h // [gu4, gv4, gu5, gv5, gu6, gv6, gu7, gv7]
+ zip1 v11.8h, v2.8h, v5.8h // [bu0, bv0, bu1, bv1, bu2, bv2, bu3, bv3]
+ zip2 v12.8h, v2.8h, v5.8h // [bu4, bv4, bu5, bv5, bu6, bv6, bu7, bv7]
+
+ zip1 v13.4s, v6.4s, v6.4s // [const, const, const, const] for U/V pairs
+ zip2 v14.4s, v6.4s, v6.4s // [const, const, const, const] for U/V pairs
+
+.Lprocess_16_\fmt_bgr:
+ // Load and sum with immediate interleaving
+ .if \element == 3
+ ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48
+ ld3 { v26.16b, v27.16b, v28.16b }, [x3], #48
+ prfm pldl1keep, [x3, #96] // Early prefetch
+
+ // Sum and immediately create interleaved RGB data
+ uaddlp v19.8h, v16.16b
+ uaddlp v20.8h, v17.16b
+ uaddlp v21.8h, v18.16b
+ uaddlp v29.8h, v26.16b
+ uaddlp v30.8h, v27.16b
+ uaddlp v31.8h, v28.16b
+ .else
+ ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+ ld4 { v26.16b, v27.16b, v28.16b, v29.16b }, [x3], #64
+ prfm pldl1keep, [x3, #128] // Early prefetch
+
+ .if \alpha_first
+ uaddlp v21.8h, v19.16b
+ uaddlp v20.8h, v18.16b
+ uaddlp v19.8h, v17.16b
+ uaddlp v31.8h, v29.16b
+ uaddlp v30.8h, v28.16b
+ uaddlp v29.8h, v27.16b
+ .else
+ uaddlp v19.8h, v16.16b
+ uaddlp v20.8h, v17.16b
+ uaddlp v21.8h, v18.16b
+ uaddlp v29.8h, v26.16b
+ uaddlp v30.8h, v27.16b
+ uaddlp v31.8h, v28.16b
+ .endif
+ .endif
+
+ zip1 v22.8h, v19.8h, v19.8h // [r0, r0, r1, r1, r2, r2, r3, r3] Block 1
+ zip2 v23.8h, v19.8h, v19.8h // [r4, r4, r5, r5, r6, r6, r7, r7] Block 1
+ zip1 v24.8h, v20.8h, v20.8h // [g0, g0, g1, g1, g2, g2, g3, g3] Block 1
+ zip2 v25.8h, v20.8h, v20.8h // [g4, g4, g5, g5, g6, g6, g7, g7] Block 1
+
+ mov v0.16b, v13.16b // UV accumulator 1a
+ mov v1.16b, v13.16b // UV accumulator 1b
+ mov v2.16b, v14.16b // UV accumulator 1c
+ mov v3.16b, v14.16b // UV accumulator 1d
+
+ smlal v0.4s, v7.4h, v22.4h
+ smlal2 v1.4s, v7.8h, v22.8h
+ smlal v2.4s, v8.4h, v23.4h
+ smlal2 v3.4s, v8.8h, v23.8h
+
+ smlal v0.4s, v9.4h, v24.4h
+ smlal2 v1.4s, v9.8h, v24.8h
+ smlal v2.4s, v10.4h, v25.4h
+ smlal2 v3.4s, v10.8h, v25.8h
+
+ zip1 v22.8h, v21.8h, v21.8h // [b0, b0, b1, b1, b2, b2, b3, b3] Block 1
+ zip2 v23.8h, v21.8h, v21.8h // [b4, b4, b5, b5, b6, b6, b7, b7] Block 1
+
+ smlal v0.4s, v11.4h, v22.4h
+ smlal2 v1.4s, v11.8h, v22.8h
+ smlal v2.4s, v12.4h, v23.4h
+ smlal2 v3.4s, v12.8h, v23.8h
+
+ zip1 v22.8h, v29.8h, v29.8h // [r0, r0, r1, r1, r2, r2, r3, r3] Block 2
+ zip2 v23.8h, v29.8h, v29.8h // [r4, r4, r5, r5, r6, r6, r7, r7] Block 2
+ zip1 v24.8h, v30.8h, v30.8h // [g0, g0, g1, g1, g2, g2, g3, g3] Block 2
+ zip2 v25.8h, v30.8h, v30.8h // [g4, g4, g5, g5, g6, g6, g7, g7] Block 2
+
+ mov v4.16b, v13.16b
+ mov v5.16b, v13.16b
+ mov v6.16b, v14.16b
+ mov v16.16b, v14.16b
+
+ smlal v4.4s, v7.4h, v22.4h
+ smlal2 v5.4s, v7.8h, v22.8h
+ smlal v6.4s, v8.4h, v23.4h
+ smlal2 v16.4s, v8.8h, v23.8h
+
+ smlal v4.4s, v9.4h, v24.4h
+ smlal2 v5.4s, v9.8h, v24.8h
+ smlal v6.4s, v10.4h, v25.4h
+ smlal2 v16.4s, v10.8h, v25.8h
+
+ zip1 v22.8h, v31.8h, v31.8h // [b0, b0, b1, b1, b2, b2, b3, b3] Block 2
+ zip2 v23.8h, v31.8h, v31.8h // [b4, b4, b5, b5, b6, b6, b7, b7] Block 2
+
+ smlal v4.4s, v11.4h, v22.4h // Process U&V for b0,b1 simultaneously
+ smlal2 v5.4s, v11.8h, v22.8h // Process U&V for b2,b3 simultaneously
+ smlal v6.4s, v12.4h, v23.4h // Process U&V for b4,b5 simultaneously
+ smlal2 v16.4s, v12.8h, v23.8h // Process U&V for b6,b7 simultaneously
+
+ uzp1 v17.4s, v0.4s, v1.4s
+ uzp2 v18.4s, v0.4s, v1.4s
+ uzp1 v19.4s, v2.4s, v3.4s
+ uzp2 v20.4s, v2.4s, v3.4s
+
+ uzp1 v21.4s, v4.4s, v5.4s // Extract U values (Block 2, part 1)
+ uzp2 v22.4s, v4.4s, v5.4s // Extract V values (Block 2, part 1)
+ uzp1 v23.4s, v6.4s, v16.4s // Extract U values (Block 2, part 2)
+ uzp2 v24.4s, v6.4s, v16.4s // Extract V values (Block 2, part 2)
+
+ sqshrn v25.4h, v17.4s, #10 // U1 (first 4)
+ sqshrn2 v25.8h, v19.4s, #10 // U1 (complete 8)
+ sqshrn v26.4h, v18.4s, #10 // V1 (first 4)
+ sqshrn2 v26.8h, v20.4s, #10 // V1 (complete 8)
+
+ sqshrn v27.4h, v21.4s, #10 // U2 (first 4)
+ sqshrn2 v27.8h, v23.4s, #10 // U2 (complete 8)
+ sqshrn v28.4h, v22.4s, #10 // V2 (first 4)
+ sqshrn2 v28.8h, v24.4s, #10 // V2 (complete 8)
+
+ // Store both blocks with paired stores for better memory bandwidth
+ stp q25, q27, [x0], #32 // Store U1, U2
+ stp q26, q28, [x1], #32 // Store V1, V2
+
+ sub w5, w5, #16 // Decrement counter
+ cmp w5, #16 // Check for more blocks
+ b.ge .Lprocess_16_\fmt_bgr
+
+ // Standard 8-pixel and scalar fallbacks (unchanged)
+ cmp w5, #8
+ b.lt .Lscalar_loop_init_\fmt_bgr
+
+.Lprocess_8_\fmt_bgr:
+ .if \element == 3
+ ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48
+ uaddlp v19.8h, v16.16b
+ uaddlp v20.8h, v17.16b
+ uaddlp v21.8h, v18.16b
+ .else
+ ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+ .if \alpha_first
+ uaddlp v21.8h, v19.16b
+ uaddlp v20.8h, v18.16b
+ uaddlp v19.8h, v17.16b
+ .else
+ uaddlp v19.8h, v16.16b
+ uaddlp v20.8h, v17.16b
+ uaddlp v21.8h, v18.16b
+ .endif
+ .endif
+
+ rgb_set_uv_coeff half=1
+ mov v22.16b, v6.16b
+ mov v23.16b, v6.16b
+ mov v24.16b, v6.16b
+ mov v25.16b, v6.16b
+
+ rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
+
+ str q16, [x0], #16
+ str q17, [x1], #16
+
+ sub w5, w5, #8
+ cmp w5, #8
+ b.ge .Lprocess_8_\fmt_bgr
+
+.Lscalar_loop_init_\fmt_bgr:
+ cbz w5, 9f
+
+.Lscalar_loop_\fmt_bgr:
+ .if \alpha_first
+ rgb_load_add_half 1, 5, 2, 6, 3, 7
+ .else
+ .if \element == 3
+ rgb_load_add_half 0, 3, 1, 4, 2, 5
+ .else
+ rgb_load_add_half 0, 4, 1, 5, 2, 6
+ .endif
+ .endif
+
+ mov x8, x9
+ mov x17, x9
+ smaddl x8, w2, w10, x8
+ smaddl x17, w2, w13, x17
+ smaddl x8, w4, w11, x8
+ smaddl x17, w4, w14, x17
+ smaddl x8, w7, w12, x8
+ smaddl x17, w7, w15, x17
+ asr w8, w8, #10
+ asr w17, w17, #10
+
+ strh w8, [x0], #2
+ strh w17, [x1], #2
+
+ sub w5, w5, #2
+ add x3, x3, #(2*\element)
+ cbnz w5, .Lscalar_loop_\fmt_bgr
+
+9: ret
+endfunc
+.endm
+
+rgbToUV_half_neon_double bgra32, rgba32, element=4
+
+rgbToUV_half_neon_double abgr32, argb32, element=4, alpha_first=1
--
2.49.0
* Re: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time
2025-05-27 16:58 ` [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time Dmitriy Kovalenko
@ 2025-05-29 19:09 ` Martin Storsjö
2025-05-30 7:08 ` Martin Storsjö
0 siblings, 1 reply; 3+ messages in thread
From: Martin Storsjö @ 2025-05-29 19:09 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Dmitriy Kovalenko
On Tue, 27 May 2025, Dmitriy Kovalenko wrote:
> This patch integrates so-called double buffering: we load two batches of
> pixels at a time and then process them in parallel. On modern ARM
> processors, especially Apple Silicon, this gives a visible benefit; it is
> especially nice for subsampled pixel processing, because it lets us read
> the input with two instructions and write it with a single one (the
> write usually being the slowest part).
>
> With the previous patch of the stack included, on a MacBook Pro M4 Max
> rgb_to_yuv_half in checkasm goes up to 2x the speed of the C version.
> ---
> libswscale/aarch64/input.S | 332 ++++++++++++++++++++++++++++++++++---
> 1 file changed, 309 insertions(+), 23 deletions(-)
>
> [...]
>
> +
> +.macro rgbToUV_half_neon_double fmt_bgr, fmt_rgb, element, alpha_first=0
> +function ff_\fmt_bgr\()ToUV_half_neon_double, export=1
> + cbz w5, 9f // exit immediately if width is 0
> + cmp w5, #16 // check if we have at least 16 pixels
> + b.lt _ff_\fmt_bgr\()ToUV_half_neon
This fails to link on anything other than Darwin targets; other platforms
don't have an underscore prefix on symbols. Use the X() macro around
symbol names to get the right external symbol name for the function.
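
For example, assuming the X() macro from libavutil/aarch64/asm.S expands to
the correctly prefixed external symbol name on each platform, the fallback
branch would look something like:

    b.lt    X(ff_\fmt_bgr\()ToUV_half_neon)
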
Also, with that fixed, this fails to properly back up and restore
registers v8-v15; checkasm doesn't notice this on macOS, but on Linux and
Windows, checkasm has a call wrapper which does detect such issues.
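
For reference, AAPCS64 makes v8-v15 callee-saved (the low 64 bits of each),
so a typical prologue/epilogue sketch looks like:

    stp     d8,  d9,  [sp, #-64]!   // save callee-saved SIMD registers
    stp     d10, d11, [sp, #16]
    stp     d12, d13, [sp, #32]
    stp     d14, d15, [sp, #48]
    // ... function body that clobbers v8-v15 ...
    ldp     d10, d11, [sp, #16]     // restore before returning
    ldp     d12, d13, [sp, #32]
    ldp     d14, d15, [sp, #48]
    ldp     d8,  d9,  [sp], #64
    ret
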
I have set up a set of test configurations for aarch64 assembly on github;
if you fetch the branch
https://github.com/mstorsjo/ffmpeg/commits/gha-aarch64, append your own
commits on top, and push this to your own fork on github, it'll test
building it in all the relevant configurations (most relevant
platforms/toolchains, including rare ones that not everybody may have
available). (You may need to activate the actions by visiting
http://github.com/<yourusername>/ffmpeg/actions.) It also does check that
the indentation of the assembly matches the common style.
// Martin
* Re: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time
2025-05-29 19:09 ` Martin Storsjö
@ 2025-05-30 7:08 ` Martin Storsjö
0 siblings, 0 replies; 3+ messages in thread
From: Martin Storsjö @ 2025-05-30 7:08 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Dmitriy Kovalenko
On Thu, 29 May 2025, Martin Storsjö wrote:
> On Tue, 27 May 2025, Dmitriy Kovalenko wrote:
>
>> This patch integrates so-called double buffering: we load two batches of
>> pixels at a time and then process them in parallel. On modern ARM
>> processors, especially Apple Silicon, this gives a visible benefit; it is
>> especially nice for subsampled pixel processing, because it lets us read
>> the input with two instructions and write it with a single one (the
>> write usually being the slowest part).
>>
>> With the previous patch of the stack included, on a MacBook Pro M4 Max
>> rgb_to_yuv_half in checkasm goes up to 2x the speed of the C version.
>> ---
>>
>> [...]
>>
>> +.macro rgbToUV_half_neon_double fmt_bgr, fmt_rgb, element, alpha_first=0
>> +function ff_\fmt_bgr\()ToUV_half_neon_double, export=1
>> + cbz w5, 9f // exit immediately if width is 0
>> + cmp w5, #16 // check if we have at least 16 pixels
>> + b.lt _ff_\fmt_bgr\()ToUV_half_neon
>
> Also, with that fixed, this fails to properly back up and restore registers
> v8-v15; checkasm doesn't notice this on macOS, but on Linux and Windows,
> checkasm has a call wrapper which does detect such issues.
This comment is still unaddressed, checkasm still fails on Linux and
Windows.
// Martin