Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time
       [not found] <20250527165800.17159-1-dmtr.kovalenko@outlook.com>
@ 2025-05-27 16:58 ` Dmitriy Kovalenko
  2025-05-29 19:09   ` Martin Storsjö
  0 siblings, 1 reply; 3+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-27 16:58 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Dmitriy Kovalenko

This patch integrates so-called double buffering: we load 2 batches of
elements at a time and then process them in parallel. On modern ARM
processors, especially Apple Silicon, this gives a visible benefit; it
is especially nice for subsampled pixel processing because it lets us
read the elements with 2 instructions and write with a single one
(the write is usually the slowest part).

With the previous patch in the stack included, rgb_to_yuv_half in
checkasm reaches 2x the speed of the C version on a MacBook Pro M4 Max.
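
Concretely, the hot loop has the following shape (a minimal sketch
distilled from the patch below, with the same register choices):

        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48  // batch 1: 16 pixels, one load
        ld3             { v26.16b, v27.16b, v28.16b }, [x3], #48  // batch 2: 16 pixels, one load
        // ... independent U/V accumulation for both batches ...
        stp             q16, q22, [x0], #32     // all 16 U outputs, one store
        stp             q17, q23, [x1], #32     // all 16 V outputs, one store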
---
 libswscale/aarch64/input.S | 332 ++++++++++++++++++++++++++++++++++---
 1 file changed, 309 insertions(+), 23 deletions(-)

diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index ee8eb24c14..59d66d0022 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -194,40 +194,94 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
         ldp             w12, w13, [x6, #20]     // w12: bu, w13: rv
         ldp             w14, w15, [x6, #28]     // w14: gv, w15: bv
 4:
-        cmp             w5, #8
         rgb_set_uv_coeff half=1
-        b.lt            2f
-1:  // load 16 pixels and prefetch memory for the next block
+
+        cmp             w5, #16
+        b.lt            2f                      // Go directly to scalar if < 16
+
+1:
     .if \element == 3
-        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
-        prfm            pldl1strm, [x3, #48]
+        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48  // First 16 pixels
+        ld3             { v26.16b, v27.16b, v28.16b }, [x3], #48  // Second 16 pixels
+        prfm            pldl1keep, [x3, #96]
     .else
-        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
-        prfm            pldl1strm, [x3, #64]
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64  // First 16 pixels
+        ld4             { v26.16b, v27.16b, v28.16b, v29.16b }, [x3], #64  // Second 16 pixels
+        prfm            pldl1keep, [x3, #128]
     .endif
     
+        // Sum adjacent pixel pairs
     .if \alpha_first
-        uaddlp          v21.8h, v19.16b         // v21: summed b pairs
-        uaddlp          v20.8h, v18.16b         // v20: summed g pairs
-        uaddlp          v19.8h, v17.16b         // v19: summed r pairs
+        uaddlp          v21.8h, v19.16b         // Block 1: B sums
+        uaddlp          v20.8h, v18.16b         // Block 1: G sums  
+        uaddlp          v19.8h, v17.16b         // Block 1: R sums
+        uaddlp          v31.8h, v29.16b         // Block 2: B sums
+        uaddlp          v30.8h, v28.16b         // Block 2: G sums
+        uaddlp          v29.8h, v27.16b         // Block 2: R sums
     .else
-        uaddlp          v19.8h, v16.16b         // v19: summed r pairs
-        uaddlp          v20.8h, v17.16b         // v20: summed g pairs
-        uaddlp          v21.8h, v18.16b         // v21: summed b pairs
+        uaddlp          v19.8h, v16.16b         // Block 1: R sums
+        uaddlp          v20.8h, v17.16b         // Block 1: G sums
+        uaddlp          v21.8h, v18.16b         // Block 1: B sums
+        uaddlp          v29.8h, v26.16b         // Block 2: R sums
+        uaddlp          v30.8h, v27.16b         // Block 2: G sums
+        uaddlp          v31.8h, v28.16b         // Block 2: B sums
     .endif
 
-        mov             v22.16b, v6.16b         // U first half
-        mov             v23.16b, v6.16b         // U second half
-        mov             v24.16b, v6.16b         // V first half
-        mov             v25.16b, v6.16b         // V second half
-
-        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
+        // Initialize accumulators for both blocks
+        mov             v7.16b, v6.16b          // Block 1: U_low
+        mov             v8.16b, v6.16b          // Block 1: U_high
+        mov             v9.16b, v6.16b          // Block 1: V_low
+        mov             v10.16b, v6.16b         // Block 1: V_high
+        mov             v11.16b, v6.16b         // Block 2: U_low
+        mov             v12.16b, v6.16b         // Block 2: U_high
+        mov             v13.16b, v6.16b         // Block 2: V_low
+        mov             v14.16b, v6.16b         // Block 2: V_high
+
+        smlal           v7.4s, v0.4h, v19.4h    // Block 1: U += ru * r (0-3)
+        smlal           v9.4s, v3.4h, v19.4h    // Block 1: V += rv * r (0-3)
+        smlal           v11.4s, v0.4h, v29.4h   // Block 2: U += ru * r (0-3)
+        smlal           v13.4s, v3.4h, v29.4h   // Block 2: V += rv * r (0-3)
+
+        smlal2          v8.4s, v0.8h, v19.8h    // Block 1: U += ru * r (4-7)
+        smlal2          v10.4s, v3.8h, v19.8h   // Block 1: V += rv * r (4-7)
+        smlal2          v12.4s, v0.8h, v29.8h   // Block 2: U += ru * r (4-7)
+        smlal2          v14.4s, v3.8h, v29.8h   // Block 2: V += rv * r (4-7)
+
+        smlal           v7.4s, v1.4h, v20.4h    // Block 1: U += gu * g (0-3)
+        smlal           v9.4s, v4.4h, v20.4h    // Block 1: V += gv * g (0-3)
+        smlal           v11.4s, v1.4h, v30.4h   // Block 2: U += gu * g (0-3)
+        smlal           v13.4s, v4.4h, v30.4h   // Block 2: V += gv * g (0-3)
+
+        smlal2          v8.4s, v1.8h, v20.8h    // Block 1: U += gu * g (4-7)
+        smlal2          v10.4s, v4.8h, v20.8h   // Block 1: V += gv * g (4-7)
+        smlal2          v12.4s, v1.8h, v30.8h   // Block 2: U += gu * g (4-7)
+        smlal2          v14.4s, v4.8h, v30.8h   // Block 2: V += gv * g (4-7)
+
+        smlal           v7.4s, v2.4h, v21.4h    // Block 1: U += bu * b (0-3)
+        smlal           v9.4s, v5.4h, v21.4h    // Block 1: V += bv * b (0-3)
+        smlal           v11.4s, v2.4h, v31.4h   // Block 2: U += bu * b (0-3)
+        smlal           v13.4s, v5.4h, v31.4h   // Block 2: V += bv * b (0-3)
+
+        smlal2          v8.4s, v2.8h, v21.8h    // Block 1: U += bu * b (4-7)
+        smlal2          v10.4s, v5.8h, v21.8h   // Block 1: V += bv * b (4-7)
+        smlal2          v12.4s, v2.8h, v31.8h   // Block 2: U += bu * b (4-7)
+        smlal2          v14.4s, v5.8h, v31.8h   // Block 2: V += bv * b (4-7)
+
+        sqshrn          v16.4h, v7.4s, #10      // Block 1: U (0-3)
+        sqshrn          v17.4h, v9.4s, #10      // Block 1: V (0-3)
+        sqshrn          v22.4h, v11.4s, #10     // Block 2: U (0-3)
+        sqshrn          v23.4h, v13.4s, #10     // Block 2: V (0-3)
+
+        sqshrn2         v16.8h, v8.4s, #10      // Block 1: U (4-7)
+        sqshrn2         v17.8h, v10.4s, #10     // Block 1: V (4-7)
+        sqshrn2         v22.8h, v12.4s, #10     // Block 2: U (4-7)
+        sqshrn2         v23.8h, v14.4s, #10     // Block 2: V (4-7)
 
-        str             q16, [x0], #16          // store dst_u
-        str             q17, [x1], #16          // store dst_v
+        stp             q16, q22, [x0], #32     // Store all 16 U values
+        stp             q17, q23, [x1], #32     // Store all 16 V values
         
-        sub             w5, w5, #8              // width -= 8
-        cmp             w5, #8                  // width >= 8 ?
+        sub             w5, w5, #16             // width -= 16
+        cmp             w5, #16                 // width >= 16 ?
         b.ge            1b
         cbz             w5, 3f                  // No pixels left? Exit
 
@@ -459,3 +513,235 @@ endfunc
 
 DISABLE_DOTPROD
 #endif
+
+.macro rgbToUV_half_neon_double fmt_bgr, fmt_rgb, element, alpha_first=0
+function ff_\fmt_bgr\()ToUV_half_neon_double, export=1
+        cbz             w5, 9f                  // exit immediately if width is 0
+        cmp             w5, #16                 // check if we have at least 16 pixels
+        b.lt            _ff_\fmt_bgr\()ToUV_half_neon
+
+        ldp             w12, w11, [x6, #12]     // w12: bu, w11: gu
+        ldp             w10, w15, [x6, #20]     // w10: ru, w15: bv
+        ldp             w14, w13, [x6, #28]     // w14: gv, w13: rv
+        
+        b               .Lcommon_uv_processing_\fmt_bgr
+endfunc
+
+function ff_\fmt_rgb\()ToUV_half_neon_double, export=1
+        cbz             w5, 9f                  // exit immediately if width is 0
+        cmp             w5, #16                 // check if we have at least 16 pixels
+        b.lt            _ff_\fmt_rgb\()ToUV_half_neon
+
+        ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
+        ldp             w12, w13, [x6, #20]     // w12: bu, w13: rv
+        ldp             w14, w15, [x6, #28]     // w14: gv, w15: bv
+
+.Lcommon_uv_processing_\fmt_bgr:
+        rgb_set_uv_coeff half=1
+        
+        zip1            v7.8h, v0.8h, v3.8h     // [ru0, rv0, ru1, rv1, ru2, rv2, ru3, rv3]
+        zip2            v8.8h, v0.8h, v3.8h     // [ru4, rv4, ru5, rv5, ru6, rv6, ru7, rv7]
+        zip1            v9.8h, v1.8h, v4.8h     // [gu0, gv0, gu1, gv1, gu2, gv2, gu3, gv3]
+        zip2            v10.8h, v1.8h, v4.8h    // [gu4, gv4, gu5, gv5, gu6, gv6, gu7, gv7]
+        zip1            v11.8h, v2.8h, v5.8h    // [bu0, bv0, bu1, bv1, bu2, bv2, bu3, bv3]
+        zip2            v12.8h, v2.8h, v5.8h    // [bu4, bv4, bu5, bv5, bu6, bv6, bu7, bv7]
+        
+        zip1            v13.4s, v6.4s, v6.4s    // [const, const, const, const] for U/V pairs
+        zip2            v14.4s, v6.4s, v6.4s    // [const, const, const, const] for U/V pairs
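+        // With the coefficients interleaved this way, each smlal below
+        // accumulates alternating U/V results in the 32-bit lanes of one
+        // accumulator; uzp1/uzp2 later separate them back into U and V.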
+
+.Lprocess_16_\fmt_bgr:
+        // Load both blocks, then sum adjacent pixel pairs
+    .if \element == 3
+        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
+        ld3             { v26.16b, v27.16b, v28.16b }, [x3], #48
+        prfm            pldl1keep, [x3, #96]    // Early prefetch
+        
+        // Sum and immediately create interleaved RGB data
+        uaddlp          v19.8h, v16.16b
+        uaddlp          v20.8h, v17.16b
+        uaddlp          v21.8h, v18.16b
+        uaddlp          v29.8h, v26.16b
+        uaddlp          v30.8h, v27.16b
+        uaddlp          v31.8h, v28.16b
+    .else
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+        ld4             { v26.16b, v27.16b, v28.16b, v29.16b }, [x3], #64
+        prfm            pldl1keep, [x3, #128]   // Early prefetch
+        
+        .if \alpha_first
+            uaddlp          v21.8h, v19.16b
+            uaddlp          v20.8h, v18.16b
+            uaddlp          v19.8h, v17.16b
+            uaddlp          v31.8h, v29.16b
+            uaddlp          v30.8h, v28.16b
+            uaddlp          v29.8h, v27.16b
+        .else
+            uaddlp          v19.8h, v16.16b
+            uaddlp          v20.8h, v17.16b
+            uaddlp          v21.8h, v18.16b
+            uaddlp          v29.8h, v26.16b
+            uaddlp          v30.8h, v27.16b
+            uaddlp          v31.8h, v28.16b
+        .endif
+    .endif
+        
+        zip1            v22.8h, v19.8h, v19.8h  // [r0, r0, r1, r1, r2, r2, r3, r3] Block 1
+        zip2            v23.8h, v19.8h, v19.8h  // [r4, r4, r5, r5, r6, r6, r7, r7] Block 1
+        zip1            v24.8h, v20.8h, v20.8h  // [g0, g0, g1, g1, g2, g2, g3, g3] Block 1
+        zip2            v25.8h, v20.8h, v20.8h  // [g4, g4, g5, g5, g6, g6, g7, g7] Block 1
+        
+        mov             v0.16b, v13.16b         // UV accumulator 1a
+        mov             v1.16b, v13.16b         // UV accumulator 1b
+        mov             v2.16b, v14.16b         // UV accumulator 1c
+        mov             v3.16b, v14.16b         // UV accumulator 1d
+        
+        smlal           v0.4s, v7.4h, v22.4h
+        smlal2          v1.4s, v7.8h, v22.8h
+        smlal           v2.4s, v8.4h, v23.4h
+        smlal2          v3.4s, v8.8h, v23.8h
+        
+        smlal           v0.4s, v9.4h, v24.4h
+        smlal2          v1.4s, v9.8h, v24.8h
+        smlal           v2.4s, v10.4h, v25.4h
+        smlal2          v3.4s, v10.8h, v25.8h
+        
+        zip1            v22.8h, v21.8h, v21.8h  // [b0, b0, b1, b1, b2, b2, b3, b3] Block 1
+        zip2            v23.8h, v21.8h, v21.8h  // [b4, b4, b5, b5, b6, b6, b7, b7] Block 1
+        
+        smlal           v0.4s, v11.4h, v22.4h
+        smlal2          v1.4s, v11.8h, v22.8h
+        smlal           v2.4s, v12.4h, v23.4h
+        smlal2          v3.4s, v12.8h, v23.8h
+        
+        zip1            v22.8h, v29.8h, v29.8h  // [r0, r0, r1, r1, r2, r2, r3, r3] Block 2
+        zip2            v23.8h, v29.8h, v29.8h  // [r4, r4, r5, r5, r6, r6, r7, r7] Block 2
+        zip1            v24.8h, v30.8h, v30.8h  // [g0, g0, g1, g1, g2, g2, g3, g3] Block 2
+        zip2            v25.8h, v30.8h, v30.8h  // [g4, g4, g5, g5, g6, g6, g7, g7] Block 2
+        
+        mov             v4.16b, v13.16b         // UV accumulator 2a
+        mov             v5.16b, v13.16b         // UV accumulator 2b
+        mov             v6.16b, v14.16b         // UV accumulator 2c
+        mov             v16.16b, v14.16b        // UV accumulator 2d
+        
+        smlal           v4.4s, v7.4h, v22.4h
+        smlal2          v5.4s, v7.8h, v22.8h
+        smlal           v6.4s, v8.4h, v23.4h
+        smlal2          v16.4s, v8.8h, v23.8h
+        
+        smlal           v4.4s, v9.4h, v24.4h
+        smlal2          v5.4s, v9.8h, v24.8h
+        smlal           v6.4s, v10.4h, v25.4h
+        smlal2          v16.4s, v10.8h, v25.8h
+        
+        zip1            v22.8h, v31.8h, v31.8h  // [b0, b0, b1, b1, b2, b2, b3, b3] Block 2
+        zip2            v23.8h, v31.8h, v31.8h  // [b4, b4, b5, b5, b6, b6, b7, b7] Block 2
+        
+        smlal           v4.4s, v11.4h, v22.4h   // Process U&V for b0,b1 simultaneously
+        smlal2          v5.4s, v11.8h, v22.8h   // Process U&V for b2,b3 simultaneously
+        smlal           v6.4s, v12.4h, v23.4h   // Process U&V for b4,b5 simultaneously
+        smlal2          v16.4s, v12.8h, v23.8h  // Process U&V for b6,b7 simultaneously
+        
+        uzp1            v17.4s, v0.4s, v1.4s    // Extract U values (Block 1, part 1)
+        uzp2            v18.4s, v0.4s, v1.4s    // Extract V values (Block 1, part 1)
+        uzp1            v19.4s, v2.4s, v3.4s    // Extract U values (Block 1, part 2)
+        uzp2            v20.4s, v2.4s, v3.4s    // Extract V values (Block 1, part 2)
+        
+        uzp1            v21.4s, v4.4s, v5.4s    // Extract U values (Block 2, part 1)
+        uzp2            v22.4s, v4.4s, v5.4s    // Extract V values (Block 2, part 1)
+        uzp1            v23.4s, v6.4s, v16.4s   // Extract U values (Block 2, part 2)
+        uzp2            v24.4s, v6.4s, v16.4s   // Extract V values (Block 2, part 2)
+        
+        sqshrn          v25.4h, v17.4s, #10     // U1 (first 4)
+        sqshrn2         v25.8h, v19.4s, #10     // U1 (complete 8)
+        sqshrn          v26.4h, v18.4s, #10     // V1 (first 4)
+        sqshrn2         v26.8h, v20.4s, #10     // V1 (complete 8)
+        
+        sqshrn          v27.4h, v21.4s, #10     // U2 (first 4)
+        sqshrn2         v27.8h, v23.4s, #10     // U2 (complete 8)
+        sqshrn          v28.4h, v22.4s, #10     // V2 (first 4)
+        sqshrn2         v28.8h, v24.4s, #10     // V2 (complete 8)
+        
+        // Store U and V for both blocks with paired stores
+        stp             q25, q27, [x0], #32     // Store U1, U2
+        stp             q26, q28, [x1], #32     // Store V1, V2
+        
+        sub             w5, w5, #16             // Decrement counter
+        cmp             w5, #16                 // Check for more blocks
+        b.ge            .Lprocess_16_\fmt_bgr
+
+        // Standard 8-pixel and scalar fallbacks (unchanged)
+        cmp             w5, #8
+        b.lt            .Lscalar_loop_init_\fmt_bgr
+
+.Lprocess_8_\fmt_bgr:
+    .if \element == 3
+        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
+        uaddlp          v19.8h, v16.16b
+        uaddlp          v20.8h, v17.16b
+        uaddlp          v21.8h, v18.16b
+    .else
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+        .if \alpha_first
+            uaddlp          v21.8h, v19.16b
+            uaddlp          v20.8h, v18.16b
+            uaddlp          v19.8h, v17.16b
+        .else
+            uaddlp          v19.8h, v16.16b
+            uaddlp          v20.8h, v17.16b
+            uaddlp          v21.8h, v18.16b
+        .endif
+    .endif
+        
+        rgb_set_uv_coeff half=1
+        mov             v22.16b, v6.16b
+        mov             v23.16b, v6.16b
+        mov             v24.16b, v6.16b
+        mov             v25.16b, v6.16b
+        
+        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
+        
+        str             q16, [x0], #16
+        str             q17, [x1], #16
+        
+        sub             w5, w5, #8
+        cmp             w5, #8
+        b.ge            .Lprocess_8_\fmt_bgr
+
+.Lscalar_loop_init_\fmt_bgr:
+        cbz             w5, 9f
+
+.Lscalar_loop_\fmt_bgr:
+    .if \alpha_first
+        rgb_load_add_half 1, 5, 2, 6, 3, 7
+    .else
+        .if \element == 3
+            rgb_load_add_half 0, 3, 1, 4, 2, 5
+        .else
+            rgb_load_add_half 0, 4, 1, 5, 2, 6
+        .endif
+    .endif
+        
+        mov             x8, x9
+        mov             x17, x9
+        smaddl          x8, w2, w10, x8
+        smaddl          x17, w2, w13, x17
+        smaddl          x8, w4, w11, x8
+        smaddl          x17, w4, w14, x17
+        smaddl          x8, w7, w12, x8
+        smaddl          x17, w7, w15, x17
+        asr             w8, w8, #10
+        asr             w17, w17, #10
+        
+        strh            w8, [x0], #2
+        strh            w17, [x1], #2
+        
+        sub             w5, w5, #2
+        add             x3, x3, #(2*\element)
+        cbnz            w5, .Lscalar_loop_\fmt_bgr
+
+9:      ret
+endfunc
+.endm
+
+rgbToUV_half_neon_double bgra32, rgba32, element=4
+
+rgbToUV_half_neon_double abgr32, argb32, element=4, alpha_first=1
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time
  2025-05-27 16:58 ` [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time Dmitriy Kovalenko
@ 2025-05-29 19:09   ` Martin Storsjö
  2025-05-30  7:08     ` Martin Storsjö
  0 siblings, 1 reply; 3+ messages in thread
From: Martin Storsjö @ 2025-05-29 19:09 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Dmitriy Kovalenko

On Tue, 27 May 2025, Dmitriy Kovalenko wrote:

> [...]
>
> +.macro rgbToUV_half_neon_double fmt_bgr, fmt_rgb, element, alpha_first=0
> +function ff_\fmt_bgr\()ToUV_half_neon_double, export=1
> +        cbz             w5, 9f                  // exit immediately if width is 0
> +        cmp             w5, #16                 // check if we have at least 16 pixels
> +        b.lt            _ff_\fmt_bgr\()ToUV_half_neon

This fails to link on anything other than Darwin targets; other platforms 
don't have an underscore prefix on symbols. Use the X() macro around 
symbol names to get the right external symbol name for the function.
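
With the current function names, that would look something like this
(a sketch; X() produces the platform's external symbol name):

        b.lt            X(ff_\fmt_bgr\()ToUV_half_neon)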

Also, with that fixed, this fails to properly back up and restore
registers v8-v15; checkasm doesn't notice this on macOS, but on Linux
and Windows, checkasm has a call wrapper which does detect such issues.
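
For reference, AAPCS64 makes the bottom 64 bits of v8-v15 callee-saved,
so a minimal save/restore sketch (the exact stack layout here is only
an assumption) would be:

        stp             d8,  d9,  [sp, #-64]!
        stp             d10, d11, [sp, #16]
        stp             d12, d13, [sp, #32]
        stp             d14, d15, [sp, #48]
        // ... function body that clobbers v8-v15 ...
        ldp             d14, d15, [sp, #48]
        ldp             d12, d13, [sp, #32]
        ldp             d10, d11, [sp, #16]
        ldp             d8,  d9,  [sp], #64
        ret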

I have set up a set of test configurations for aarch64 assembly on GitHub;
if you fetch the branch
https://github.com/mstorsjo/ffmpeg/commits/gha-aarch64, append your own
commits on top, and push this to your own fork on GitHub, it'll test
building it in all the relevant configurations (most relevant
platforms/toolchains, including rare ones that not everybody may have
available). (You may need to activate the actions by visiting
http://github.com/<yourusername>/ffmpeg/actions.) It also checks that
the indentation of the assembly matches the common style.
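
Roughly (the remote and branch names here are just placeholders):

	git remote add mstorsjo https://github.com/mstorsjo/ffmpeg.git
	git fetch mstorsjo gha-aarch64
	git checkout -b test-gha-aarch64 mstorsjo/gha-aarch64
	git cherry-pick <your-patch-commits>
	git push <your-fork> test-gha-aarch64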

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time
  2025-05-29 19:09   ` Martin Storsjö
@ 2025-05-30  7:08     ` Martin Storsjö
  0 siblings, 0 replies; 3+ messages in thread
From: Martin Storsjö @ 2025-05-30  7:08 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Dmitriy Kovalenko

On Thu, 29 May 2025, Martin Storsjö wrote:

> On Tue, 27 May 2025, Dmitriy Kovalenko wrote:
>
>> [...]
>
>> +.macro rgbToUV_half_neon_double fmt_bgr, fmt_rgb, element, alpha_first=0
>> +function ff_\fmt_bgr\()ToUV_half_neon_double, export=1
>> +        cbz             w5, 9f                  // exit immediately if width is 0
>> +        cmp             w5, #16                 // check if we have at least 16 pixels
>> +        b.lt            _ff_\fmt_bgr\()ToUV_half_neon
>
> Also, with that fixed, this fails to properly back up and restore
> registers v8-v15; checkasm doesn't notice this on macOS, but on Linux
> and Windows, checkasm has a call wrapper which does detect such issues.

This comment is still unaddressed; checkasm still fails on Linux and
Windows.

// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-05-30  7:08 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20250527165800.17159-1-dmtr.kovalenko@outlook.com>
2025-05-27 16:58 ` [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time Dmitriy Kovalenko
2025-05-29 19:09   ` Martin Storsjö
2025-05-30  7:08     ` Martin Storsjö

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git