Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations
@ 2025-05-27 16:57 Dmitriy Kovalenko
  2025-05-29 18:53 ` Martin Storsjö
  0 siblings, 1 reply; 9+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-27 16:57 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Dmitriy Kovalenko

I've found quite a few ways to optimize FFmpeg's existing rgb to yuv
subsampled conversion. In this patch stack I'll try to improve the
performance.

This particular set of changes is a small improvement to all the
existing functions and macros. The biggest performance gain comes
from the post-load increment of the pointer with immediate
prefetching of the next memory block, and from interleaving the
multiply and shift operations on different registers for better
scheduling.

Also changed a number of places where cmp + b.le was used instead
of a single cbz/cbnz instruction, and made some other small cleanups.
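
For example, the width check at the top of each function (taken from
the diff below) goes from

        cmp             w5, #0          // check width > 0
        b.le            3f

to

        cbz             w5, 3f          // check width > 0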

Here are checkasm results on a MacBook Pro with the latest M4 Max:

<before>

bgra_to_uv_1080_c:                                     257.5 ( 1.00x)
bgra_to_uv_1080_neon:                                  211.9 ( 1.22x)
bgra_to_uv_1920_c:                                     467.1 ( 1.00x)
bgra_to_uv_1920_neon:                                  379.3 ( 1.23x)
bgra_to_uv_half_1080_c:                                198.9 ( 1.00x)
bgra_to_uv_half_1080_neon:                             125.7 ( 1.58x)
bgra_to_uv_half_1920_c:                                346.3 ( 1.00x)
bgra_to_uv_half_1920_neon:                             223.7 ( 1.55x)

<after>

bgra_to_uv_1080_c:                                     268.3 ( 1.00x)
bgra_to_uv_1080_neon:                                  176.0 ( 1.53x)
bgra_to_uv_1920_c:                                     456.6 ( 1.00x)
bgra_to_uv_1920_neon:                                  307.7 ( 1.48x)
bgra_to_uv_half_1080_c:                                193.2 ( 1.00x)
bgra_to_uv_half_1080_neon:                              96.8 ( 2.00x)
bgra_to_uv_half_1920_c:                                347.2 ( 1.00x)
bgra_to_uv_half_1920_neon:                             182.6 ( 1.92x)

With my proprietary test on iOS it gives around a 70% performance
improvement converting a 1920x1920 BGRA image to yuv420p.

On my Linux ARM Cortex-R processor the performance improvement is not
as visible, but it is still consistently 5-10% faster than the current
implementation.
---
 libswscale/aarch64/input.S | 166 +++++++++++++++++++++++++------------
 1 file changed, 112 insertions(+), 54 deletions(-)

diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index c1c0adffc8..ee8eb24c14 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -1,5 +1,4 @@
-/*
- * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
+/* Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
  *
  * This file is part of FFmpeg.
  *
@@ -57,20 +56,41 @@
         sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
 .endm
 
+// interleaved product version of the rgb to yuv gives slightly better performance on non-performant mobile 
+.macro rgb_to_uv_interleaved_product r, g, b, u_coef0, u_coef1, u_coef2, v_coef0, v_coef1, v_coef2, u_dst1, u_dst2, v_dst1, v_dst2, u_dst, v_dst, right_shift
+    smlal   \u_dst1\().4s, \u_coef0\().4h, \r\().4h     // U += ru * r (first 4)
+    smlal   \v_dst1\().4s, \v_coef0\().4h, \r\().4h     // V += rv * r (first 4)
+    smlal2  \u_dst2\().4s, \u_coef0\().8h, \r\().8h     // U += ru * r (second 4)
+    smlal2  \v_dst2\().4s, \v_coef0\().8h, \r\().8h     // V += rv * r (second 4)
+    
+    smlal   \u_dst1\().4s, \u_coef1\().4h, \g\().4h     // U += gu * g (first 4)
+    smlal   \v_dst1\().4s, \v_coef1\().4h, \g\().4h     // V += gv * g (first 4)
+    smlal2  \u_dst2\().4s, \u_coef1\().8h, \g\().8h     // U += gu * g (second 4)
+    smlal2  \v_dst2\().4s, \v_coef1\().8h, \g\().8h     // V += gv * g (second 4)
+    
+    smlal   \u_dst1\().4s, \u_coef2\().4h, \b\().4h     // U += bu * b (first 4)
+    smlal   \v_dst1\().4s, \v_coef2\().4h, \b\().4h     // V += bv * b (first 4)
+    smlal2  \u_dst2\().4s, \u_coef2\().8h, \b\().8h     // U += bu * b (second 4)
+    smlal2  \v_dst2\().4s, \v_coef2\().8h, \b\().8h     // V += bv * b (second 4)
+
+    sqshrn  \u_dst\().4h, \u_dst1\().4s, \right_shift   // U first 4 pixels
+    sqshrn2 \u_dst\().8h, \u_dst2\().4s, \right_shift   // U all 8 pixels
+    sqshrn  \v_dst\().4h, \v_dst1\().4s, \right_shift   // V first 4 pixels
+    sqshrn2 \v_dst\().8h, \v_dst2\().4s, \right_shift   // V all 8 pixels
+.endm
+
 .macro rgbToY_neon fmt_bgr, fmt_rgb, element, alpha_first=0
 function ff_\fmt_bgr\()ToY_neon, export=1
-        cmp             w4, #0                  // check width > 0
+        cbz             w4, 3f                  // check width > 0
         ldp             w12, w11, [x5]          // w12: ry, w11: gy
         ldr             w10, [x5, #8]           // w10: by
-        b.gt            4f
-        ret
+        b               4f
 endfunc
 
 function ff_\fmt_rgb\()ToY_neon, export=1
-        cmp             w4, #0                  // check width > 0
+        cbz             w4, 3f                  // check width > 0
         ldp             w10, w11, [x5]          // w10: ry, w11: gy
         ldr             w12, [x5, #8]           // w12: by
-        b.le            3f
 4:
         mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
         movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
@@ -158,8 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1
 
 .macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
 function ff_\fmt_bgr\()ToUV_half_neon, export=1
-        cmp             w5, #0          // check width > 0
-        b.le            3f
+        cbz             w5, 3f          // check width > 0
 
         ldp             w12, w11, [x6, #12]
         ldp             w10, w15, [x6, #20]
@@ -168,7 +187,7 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1
 endfunc
 
 function ff_\fmt_rgb\()ToUV_half_neon, export=1
-        cmp             w5, #0          // check width > 0
+        cmp             w5, #0                  // check width > 0
         b.le            3f
 
         ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
@@ -178,32 +197,41 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
         cmp             w5, #8
         rgb_set_uv_coeff half=1
         b.lt            2f
-1:
+1:  // load 16 pixels and prefetch memory for the next block
     .if \element == 3
-        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
+        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
+        prfm            pldl1strm, [x3, #48]
     .else
-        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+        prfm            pldl1strm, [x3, #64]
     .endif
+    
     .if \alpha_first
-        uaddlp          v21.8h, v19.16b
-        uaddlp          v20.8h, v18.16b
-        uaddlp          v19.8h, v17.16b
+        uaddlp          v21.8h, v19.16b         // v21: summed b pairs
+        uaddlp          v20.8h, v18.16b         // v20: summed g pairs
+        uaddlp          v19.8h, v17.16b         // v19: summed r pairs
     .else
-        uaddlp          v19.8h, v16.16b         // v19: r
-        uaddlp          v20.8h, v17.16b         // v20: g
-        uaddlp          v21.8h, v18.16b         // v21: b
+        uaddlp          v19.8h, v16.16b         // v19: summed r pairs
+        uaddlp          v20.8h, v17.16b         // v20: summed g pairs
+        uaddlp          v21.8h, v18.16b         // v21: summed b pairs
     .endif
 
-        rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
-        rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
-        sub             w5, w5, #8              // width -= 8
-        add             x3, x3, #(16*\element)
-        cmp             w5, #8                  // width >= 8 ?
+        mov             v22.16b, v6.16b         // U first half
+        mov             v23.16b, v6.16b         // U second half
+        mov             v24.16b, v6.16b         // V first half
+        mov             v25.16b, v6.16b         // V second half
+
+        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
+
         str             q16, [x0], #16          // store dst_u
         str             q17, [x1], #16          // store dst_v
+        
+        sub             w5, w5, #8              // width -= 8
+        cmp             w5, #8                  // width >= 8 ?
         b.ge            1b
-        cbz             w5, 3f
-2:
+        cbz             w5, 3f                  // No pixels left? Exit
+
+2:      // Scalar fallback for remaining pixels
 .if \alpha_first
         rgb_load_add_half 1, 5, 2, 6, 3, 7
 .else
@@ -213,21 +241,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
         rgb_load_add_half 0, 4, 1, 5, 2, 6
     .endif
 .endif
-
         smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
+        smaddl          x16, w2, w13, x9        // dst_v = rv * r + const_offset (parallel)
+        
         smaddl          x8, w4, w11, x8         // dst_u += gu * g
+        smaddl          x16, w4, w14, x16       // dst_v += gv * g (parallel)
+        
         smaddl          x8, w7, w12, x8         // dst_u += bu * b
-        asr             x8, x8, #10             // dst_u >>= 10
+        smaddl          x16, w7, w15, x16       // dst_v += bv * b (parallel)
+        
+        asr             w8, w8, #10             // dst_u >>= 10
+        asr             w16, w16, #10           // dst_v >>= 10
+        
         strh            w8, [x0], #2            // store dst_u
-
-        smaddl          x8, w2, w13, x9         // dst_v = rv * r + const_offset
-        smaddl          x8, w4, w14, x8         // dst_v += gv * g
-        smaddl          x8, w7, w15, x8         // dst_v += bv * b
-        asr             x8, x8, #10             // dst_v >>= 10
-        sub             w5, w5, #1
-        add             x3, x3, #(2*\element)
-        strh            w8, [x1], #2            // store dst_v
-        cbnz            w5, 2b
+        strh            w16, [x1], #2           // store dst_v
+        
+        sub             w5, w5, #1              // width--
+        add             x3, x3, #(2*\element)   // Advance source pointer
+        cbnz            w5, 2b                  // Process next pixel if any left
 3:
         ret
 endfunc
@@ -244,9 +275,9 @@ function ff_\fmt_bgr\()ToUV_neon, export=1
         cmp             w5, #0                  // check width > 0
         b.le            3f
 
-        ldp             w12, w11, [x6, #12]
-        ldp             w10, w15, [x6, #20]
-        ldp             w14, w13, [x6, #28]
+        ldp             w12, w11, [x6, #12]     // bu, gu
+        ldp             w10, w15, [x6, #20]     // ru, bv
+        ldp             w14, w13, [x6, #28]     // gv, rv
         b               4f
 endfunc
 
@@ -263,21 +294,48 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
         b.lt            2f
 1:
     .if \alpha_first
-        argb_to_yuv_load_rgb x3
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+        uxtl            v21.8h, v19.8b             // v21: b
+        uxtl2           v24.8h, v19.16b            // v24: b
+        uxtl            v19.8h, v17.8b             // v19: r
+        uxtl            v20.8h, v18.8b             // v20: g
+        uxtl2           v22.8h, v17.16b            // v22: r
+        uxtl2           v23.8h, v18.16b            // v23: g
     .else
-        rgb_to_yuv_load_rgb x3, \element
+        .if \element == 3
+            ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
+            prfm            pldl1strm, [x3, #48]
+        .else // element == 4
+            ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+            prfm            pldl1strm, [x3, #64]
+        .endif
+        uxtl            v19.8h, v16.8b             // v19: r
+        uxtl            v20.8h, v17.8b             // v20: g
+        uxtl            v21.8h, v18.8b             // v21: b
+        uxtl2           v22.8h, v16.16b            // v22: r
+        uxtl2           v23.8h, v17.16b            // v23: g
+        uxtl2           v24.8h, v18.16b            // v24: b
     .endif
-        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
-        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
-        rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
-        rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
-        sub             w5, w5, #16
-        add             x3, x3, #(16*\element)
-        cmp             w5, #16
-        stp             q16, q17, [x0], #32     // store to dst_u
-        stp             q18, q19, [x1], #32     // store to dst_v
+        // process 2 groups of 8 pixels
+        mov             v25.16b, v6.16b         // U_dst1 = const_offset (32-bit accumulators)
+        mov             v26.16b, v6.16b         // U_dst2 = const_offset
+        mov             v27.16b, v6.16b         // V_dst1 = const_offset
+        mov             v28.16b, v6.16b         // V_dst2 = const_offset
+        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v16, v18, #9
+
+        mov             v25.16b, v6.16b 
+        mov             v26.16b, v6.16b
+        mov             v27.16b, v6.16b
+        mov             v28.16b, v6.16b
+        rgb_to_uv_interleaved_product v22, v23, v24, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v17, v19, #9
+
+        sub             w5, w5, #16             // width -= 16
+        cmp             w5, #16                 // width >= 16 ?
+        stp             q16, q17, [x0], #32     // store to dst_u (post-increment)
+        stp             q18, q19, [x1], #32     // store to dst_v (post-increment)
         b.ge            1b
-        cbz             w5, 3f
+        cbz             w5, 3f                  // No pixels left? Exit
+
 2:
     .if \alpha_first
         ldrb            w16, [x3, #1]           // w16: r
@@ -292,13 +350,13 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
         smaddl          x8, w16, w10, x9        // x8 = ru * r + const_offset
         smaddl          x8, w17, w11, x8        // x8 += gu * g
         smaddl          x8, w4, w12, x8         // x8 += bu * b
-        asr             w8, w8, #9              // x8 >>= 9
+        asr             x8, x8, #9              // x8 >>= 9
         strh            w8, [x0], #2            // store to dst_u
 
         smaddl          x8, w16, w13, x9        // x8 = rv * r + const_offset
         smaddl          x8, w17, w14, x8        // x8 += gv * g
         smaddl          x8, w4, w15, x8         // x8 += bv * b
-        asr             w8, w8, #9              // x8 >>= 9
+        asr             x8, x8, #9              // x8 >>= 9
         sub             w5, w5, #1              // width--
         add             x3, x3, #\element
         strh            w8, [x1], #2            // store to dst_v
-- 
2.49.0


* Re: [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations
  2025-05-27 16:57 [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations Dmitriy Kovalenko
@ 2025-05-29 18:53 ` Martin Storsjö
  2025-05-29 21:38   ` Dmitriy Kovalenko
  2025-05-30  7:07   ` Martin Storsjö
  0 siblings, 2 replies; 9+ messages in thread
From: Martin Storsjö @ 2025-05-29 18:53 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Dmitriy Kovalenko

On Tue, 27 May 2025, Dmitriy Kovalenko wrote:

> This particular set of changes is a small improvement to all the
> existing functions and macros. The biggest performance gain comes
> from the post-load increment of the pointer with immediate
> prefetching

How certain are you about the prefetching actually making a notable 
difference here? I tried this patch, on an M3 Pro, and I see no difference 
in the benchmark numbers from checkasm if I remove those prefetch 
instructions. Do you have a setup where you can actually measure that they 
do make a difference?

So far, nothing within ffmpeg uses prefetch instructions anywhere, and I 
haven't seen a case where they would actually do anything beneficial. (The 
conventional wisdom I've heard is that they mostly don't do anything or 
actually end up harmful. In this case, they also add a direct dependence 
on the updated pointer register from the directly preceding instruction, 
which _is_ harmful on in-order cores, unless it entirely ignores the 
instruction.)
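
For illustration, the pattern in question is (taken from the patch):

        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
        prfm            pldl1strm, [x3, #64]    // needs the x3 just written back by the ld4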

> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> index c1c0adffc8..ee8eb24c14 100644
> --- a/libswscale/aarch64/input.S
> +++ b/libswscale/aarch64/input.S
> @@ -1,5 +1,4 @@
> -/*
> - * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
> +/* Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>

This is an unrelated change

>  *
>  * This file is part of FFmpeg.
>  *
> @@ -57,20 +56,41 @@
>         sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
> .endm
>
> +// interleaved product version of the rgb to yuv gives slightly better performance on non-performant mobile
> +.macro rgb_to_uv_interleaved_product r, g, b, u_coef0, u_coef1, u_coef2, v_coef0, v_coef1, v_coef2, u_dst1, u_dst2, v_dst1, v_dst2, u_dst, v_dst, right_shift
> +    smlal   \u_dst1\().4s, \u_coef0\().4h, \r\().4h     // U += ru * r (first 4)
> +    smlal   \v_dst1\().4s, \v_coef0\().4h, \r\().4h     // V += rv * r (first 4)
> +    smlal2  \u_dst2\().4s, \u_coef0\().8h, \r\().8h     // U += ru * r (second 4)
> +    smlal2  \v_dst2\().4s, \v_coef0\().8h, \r\().8h     // V += rv * r (second 4)
> +

The patch adds trailing whitespace here and in many other places; make 
sure you don't do that. (It is visible by doing e.g. "git show".)

Also, as a general rule, indent instructions within macros in the same way 
as elsewhere.

> +    smlal   \u_dst1\().4s, \u_coef1\().4h, \g\().4h     // U += gu * g (first 4)
> +    smlal   \v_dst1\().4s, \v_coef1\().4h, \g\().4h     // V += gv * g (first 4)
> +    smlal2  \u_dst2\().4s, \u_coef1\().8h, \g\().8h     // U += gu * g (second 4)
> +    smlal2  \v_dst2\().4s, \v_coef1\().8h, \g\().8h     // V += gv * g (second 4)

If by "non-performant mobile" you mean small in-order cores, most of
them can handle repeated accumulation like these even faster, if you
sequence these so that all accumulations to one register happen
sequentially. E.g. first all "smlal \u_dst1\().4s", followed by all
"smlal \u_dst2\().4s", followed by \v_dst1, followed by \v_dst2. It's
worth benchmarking if you do have access to such cores (e.g.
Cortex-A53/A55; perhaps that's also the case on the Cortex-R you
mentioned in the commit message).

That kind of code sequence is very counterintuitive, when considering 
instruction scheduling for an in-order core, but there is an explicit 
mention of it in the code optimization guides for them; it's usually 
worthwhile to test out such a form of these accumulations.
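
Concretely, a sketch of the ordering I mean (untested, using your
macro's operands):

        smlal           \u_dst1\().4s, \u_coef0\().4h, \r\().4h
        smlal           \u_dst1\().4s, \u_coef1\().4h, \g\().4h
        smlal           \u_dst1\().4s, \u_coef2\().4h, \b\().4h
        smlal2          \u_dst2\().4s, \u_coef0\().8h, \r\().8h
        smlal2          \u_dst2\().4s, \u_coef1\().8h, \g\().8h
        smlal2          \u_dst2\().4s, \u_coef2\().8h, \b\().8h
        // ...then the same three accumulations for \v_dst1, then for \v_dst2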

> +
> +    smlal   \u_dst1\().4s, \u_coef2\().4h, \b\().4h     // U += bu * b (first 4)
> +    smlal   \v_dst1\().4s, \v_coef2\().4h, \b\().4h     // V += bv * b (first 4)
> +    smlal2  \u_dst2\().4s, \u_coef2\().8h, \b\().8h     // U += bu * b (second 4)
> +    smlal2  \v_dst2\().4s, \v_coef2\().8h, \b\().8h     // V += bv * b (second 4)
> +
> +    sqshrn  \u_dst\().4h, \u_dst1\().4s, \right_shift   // U first 4 pixels
> +    sqshrn2 \u_dst\().8h, \u_dst2\().4s, \right_shift   // U all 8 pixels
> +    sqshrn  \v_dst\().4h, \v_dst1\().4s, \right_shift   // V first 4 pixels
> +    sqshrn2 \v_dst\().8h, \v_dst2\().4s, \right_shift   // V all 8 pixels
> +.endm
> +
> .macro rgbToY_neon fmt_bgr, fmt_rgb, element, alpha_first=0
> function ff_\fmt_bgr\()ToY_neon, export=1
> -        cmp             w4, #0                  // check width > 0
> +        cbz             w4, 3f                  // check width > 0
>         ldp             w12, w11, [x5]          // w12: ry, w11: gy
>         ldr             w10, [x5, #8]           // w10: by
> -        b.gt            4f
> -        ret
> +        b               4f
> endfunc
>
> function ff_\fmt_rgb\()ToY_neon, export=1
> -        cmp             w4, #0                  // check width > 0
> +        cbz             w4, 3f                  // check width > 0
>         ldp             w10, w11, [x5]          // w10: ry, w11: gy
>         ldr             w12, [x5, #8]           // w12: by
> -        b.le            3f
> 4:
>         mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
>         movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
> @@ -158,8 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1
>
> .macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
> function ff_\fmt_bgr\()ToUV_half_neon, export=1
> -        cmp             w5, #0          // check width > 0
> -        b.le            3f
> +        cbz             w5, 3f          // check width > 0
>
>         ldp             w12, w11, [x6, #12]
>         ldp             w10, w15, [x6, #20]
> @@ -168,7 +187,7 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1
> endfunc
>
> function ff_\fmt_rgb\()ToUV_half_neon, export=1
> -        cmp             w5, #0          // check width > 0
> +        cmp             w5, #0                  // check width > 0
>         b.le            3f
>
>         ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
> @@ -178,32 +197,41 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
>         cmp             w5, #8
>         rgb_set_uv_coeff half=1
>         b.lt            2f
> -1:
> +1:  // load 16 pixels and prefetch memory for the next block
>     .if \element == 3
> -        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
> +        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
> +        prfm            pldl1strm, [x3, #48]
>     .else
> -        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
> +        prfm            pldl1strm, [x3, #64]
>     .endif
> +
>     .if \alpha_first
> -        uaddlp          v21.8h, v19.16b
> -        uaddlp          v20.8h, v18.16b
> -        uaddlp          v19.8h, v17.16b
> +        uaddlp          v21.8h, v19.16b         // v21: summed b pairs
> +        uaddlp          v20.8h, v18.16b         // v20: summed g pairs
> +        uaddlp          v19.8h, v17.16b         // v19: summed r pairs
>     .else
> -        uaddlp          v19.8h, v16.16b         // v19: r
> -        uaddlp          v20.8h, v17.16b         // v20: g
> -        uaddlp          v21.8h, v18.16b         // v21: b
> +        uaddlp          v19.8h, v16.16b         // v19: summed r pairs
> +        uaddlp          v20.8h, v17.16b         // v20: summed g pairs
> +        uaddlp          v21.8h, v18.16b         // v21: summed b pairs
>     .endif
>
> -        rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
> -        rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
> -        sub             w5, w5, #8              // width -= 8
> -        add             x3, x3, #(16*\element)
> -        cmp             w5, #8                  // width >= 8 ?
> +        mov             v22.16b, v6.16b         // U first half
> +        mov             v23.16b, v6.16b         // U second half
> +        mov             v24.16b, v6.16b         // V first half
> +        mov             v25.16b, v6.16b         // V second half
> +
> +        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
> +
>         str             q16, [x0], #16          // store dst_u
>         str             q17, [x1], #16          // store dst_v
> +
> +        sub             w5, w5, #8              // width -= 8
> +        cmp             w5, #8                  // width >= 8 ?
>         b.ge            1b
> -        cbz             w5, 3f
> -2:
> +        cbz             w5, 3f                  // No pixels left? Exit
> +
> +2:      // Scalar fallback for remaining pixels
> .if \alpha_first
>         rgb_load_add_half 1, 5, 2, 6, 3, 7
> .else
> @@ -213,21 +241,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
>         rgb_load_add_half 0, 4, 1, 5, 2, 6
>     .endif
> .endif
> -
>         smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
> +        smaddl          x16, w2, w13, x9        // dst_v = rv * r + const_offset (parallel)
> +
>         smaddl          x8, w4, w11, x8         // dst_u += gu * g
> +        smaddl          x16, w4, w14, x16       // dst_v += gv * g (parallel)
> +
>         smaddl          x8, w7, w12, x8         // dst_u += bu * b
> -        asr             x8, x8, #10             // dst_u >>= 10
> +        smaddl          x16, w7, w15, x16       // dst_v += bv * b (parallel)
> +
> +        asr             w8, w8, #10             // dst_u >>= 10
> +        asr             w16, w16, #10           // dst_v >>= 10
> +
>         strh            w8, [x0], #2            // store dst_u
> -
> -        smaddl          x8, w2, w13, x9         // dst_v = rv * r + const_offset
> -        smaddl          x8, w4, w14, x8         // dst_v += gv * g
> -        smaddl          x8, w7, w15, x8         // dst_v += bv * b
> -        asr             x8, x8, #10             // dst_v >>= 10
> -        sub             w5, w5, #1
> -        add             x3, x3, #(2*\element)
> -        strh            w8, [x1], #2            // store dst_v
> -        cbnz            w5, 2b
> +        strh            w16, [x1], #2           // store dst_v
> +
> +        sub             w5, w5, #1              // width--
> +        add             x3, x3, #(2*\element)   // Advance source pointer
> +        cbnz            w5, 2b                  // Process next pixel if any left
> 3:
>         ret
> endfunc
> @@ -244,9 +275,9 @@ function ff_\fmt_bgr\()ToUV_neon, export=1
>         cmp             w5, #0                  // check width > 0
>         b.le            3f
>
> -        ldp             w12, w11, [x6, #12]
> -        ldp             w10, w15, [x6, #20]
> -        ldp             w14, w13, [x6, #28]
> +        ldp             w12, w11, [x6, #12]     // bu, gu
> +        ldp             w10, w15, [x6, #20]     // ru, bv
> +        ldp             w14, w13, [x6, #28]     // gv, rv
>         b               4f
> endfunc
>
> @@ -263,21 +294,48 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
>         b.lt            2f
> 1:
>     .if \alpha_first
> -        argb_to_yuv_load_rgb x3
> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
> +        uxtl            v21.8h, v19.8b             // v21: b
> +        uxtl2           v24.8h, v19.16b            // v24: b
> +        uxtl            v19.8h, v17.8b             // v19: r
> +        uxtl            v20.8h, v18.8b             // v20: g
> +        uxtl2           v22.8h, v17.16b            // v22: r
> +        uxtl2           v23.8h, v18.16b            // v23: g

The code here, and below, is exactly the same as before, except for the 
postincrement on the load (plus the prefetch). Can we add the 
postincrement to the macro rather than unmacroing the code?
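
E.g. something like this (only a sketch; the \inc parameter name is
made up):

.macro rgb_to_yuv_load_rgb src, element, inc=0
  .if \element == 3
    .if \inc
        ld3             { v16.16b, v17.16b, v18.16b }, [\src], #48
    .else
        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
    .endif
  .else
    .if \inc
        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src], #64
    .else
        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
    .endif
  .endif
        uxtl            v19.8h, v16.8b          // v19: r
        // ... widen the remaining registers as before
.endm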

>     .else
> -        rgb_to_yuv_load_rgb x3, \element
> +        .if \element == 3
> +            ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
> +            prfm            pldl1strm, [x3, #48]

Instead of adding unusual indentation of the instructions here, you could 
use 2 instead of 4 spaces for the .if directives, to keep the vertical 
alignment of the instructions.
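
I.e. something like

    .else
      .if \element == 3
        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
      .else
        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
      .endif

so the instructions stay in the same column as in the rest of the file.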

> +        .else // element == 4
> +            ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
> +            prfm            pldl1strm, [x3, #64]
> +        .endif
> +        uxtl            v19.8h, v16.8b             // v19: r
> +        uxtl            v20.8h, v17.8b             // v20: g
> +        uxtl            v21.8h, v18.8b             // v21: b
> +        uxtl2           v22.8h, v16.16b            // v22: r
> +        uxtl2           v23.8h, v17.16b            // v23: g
> +        uxtl2           v24.8h, v18.16b            // v24: b
>     .endif
> -        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
> -        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
> -        rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
> -        rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
> -        sub             w5, w5, #16
> -        add             x3, x3, #(16*\element)
> -        cmp             w5, #16
> -        stp             q16, q17, [x0], #32     // store to dst_u
> -        stp             q18, q19, [x1], #32     // store to dst_v
> +        // process 2 groups of 8 pixels
> +        mov             v25.16b, v6.16b         // U_dst1 = const_offset (32-bit accumulators)
> +        mov             v26.16b, v6.16b         // U_dst2 = const_offset
> +        mov             v27.16b, v6.16b         // V_dst1 = const_offset
> +        mov             v28.16b, v6.16b         // V_dst2 = const_offset
> +        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v16, v18, #9
> +
> +        mov             v25.16b, v6.16b
> +        mov             v26.16b, v6.16b
> +        mov             v27.16b, v6.16b
> +        mov             v28.16b, v6.16b
> +        rgb_to_uv_interleaved_product v22, v23, v24, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v17, v19, #9
> +
> +        sub             w5, w5, #16             // width -= 16
> +        cmp             w5, #16                 // width >= 16 ?
> +        stp             q16, q17, [x0], #32     // store to dst_u (post-increment)
> +        stp             q18, q19, [x1], #32     // store to dst_v (post-increment)
>         b.ge            1b
> -        cbz             w5, 3f
> +        cbz             w5, 3f                  // No pixels left? Exit
> +
> 2:
>     .if \alpha_first
>         ldrb            w16, [x3, #1]           // w16: r
> @@ -292,13 +350,13 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
>         smaddl          x8, w16, w10, x9        // x8 = ru * r + const_offset
>         smaddl          x8, w17, w11, x8        // x8 += gu * g
>         smaddl          x8, w4, w12, x8         // x8 += bu * b
> -        asr             w8, w8, #9              // x8 >>= 9
> +        asr             x8, x8, #9              // x8 >>= 9
>         strh            w8, [x0], #2            // store to dst_u

Does this make any practical difference, as we're just storing the lower 
32 bits anyway?
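
(After asr #9, strh stores bits 9..24 of the pre-shift value, which are
the same whether the shift was done on w8 or x8.)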

// Martin


* Re: [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations
  2025-05-29 18:53 ` Martin Storsjö
@ 2025-05-29 21:38   ` Dmitriy Kovalenko
  2025-05-30  7:09     ` Martin Storsjö
  2025-05-30  7:07   ` Martin Storsjö
  1 sibling, 1 reply; 9+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-29 21:38 UTC (permalink / raw)
  To: Martin Storsjö; +Cc: FFmpeg development discussions and patches

I appreciate the review of both commits. I fixed all the unrelated changes and iterated in a new version; I would appreciate a re-review.

> On May 29, 2025, at 20:53, Martin Storsjö <martin@martin.st> wrote:
> 
> On Tue, 27 May 2025, Dmitriy Kovalenko wrote:
> 
>> This particular set of changes is a small improvement to all the
>> existing functions and macros. The biggest performance gain comes
>> from the post-load increment of the pointer with immediate
>> prefetching
> 
> How certain are you about the prefetching actually making a notable difference here? I tried this patch, on an M3 Pro, and I see no difference in the benchmark numbers from checkasm if I remove those prefetch instructions. Do you have a setup where you can actually measure that they do make a difference?
> 
> So far, nothing within ffmpeg uses prefetch instructions anywhere, and I haven't seen a case where they would actually do anything beneficial. (The conventional wisdom I've heard is that they mostly don't do anything or actually end up harmful. In this case, they also add a direct dependence on the updated pointer register from the directly preceding instruction, which _is_ harmful on in-order cores, unless it entirely ignores the instruction.)
> 
>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>> index c1c0adffc8..ee8eb24c14 100644
>> --- a/libswscale/aarch64/input.S
>> +++ b/libswscale/aarch64/input.S
>> @@ -1,5 +1,4 @@
>> -/*
>> - * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>> +/* Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
> 
> This is an unrelated change
> 
>> *
>> * This file is part of FFmpeg.
>> *
>> @@ -57,20 +56,41 @@
>>        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>> .endm
>> 
>> +// interleaved product version of the rgb to yuv gives slightly better performance on non-performant mobile
>> +.macro rgb_to_uv_interleaved_product r, g, b, u_coef0, u_coef1, u_coef2, v_coef0, v_coef1, v_coef2, u_dst1, u_dst2, v_dst1, v_dst2, u_dst, v_dst, right_shift
>> +    smlal   \u_dst1\().4s, \u_coef0\().4h, \r\().4h     // U += ru * r (first 4)
>> +    smlal   \v_dst1\().4s, \v_coef0\().4h, \r\().4h     // V += rv * r (first 4)
>> +    smlal2  \u_dst2\().4s, \u_coef0\().8h, \r\().8h     // U += ru * r (second 4)
>> +    smlal2  \v_dst2\().4s, \v_coef0\().8h, \r\().8h     // V += rv * r (second 4)
>> +
> 
> The patch adds trailing whitespace here and in many other places; make sure you don't do that. (It is visible by doing e.g. "git show".)
> 
> Also, as a general rule, indent instructions within macros in the same way as elsewhere.
> 
>> +    smlal   \u_dst1\().4s, \u_coef1\().4h, \g\().4h     // U += gu * g (first 4)
>> +    smlal   \v_dst1\().4s, \v_coef1\().4h, \g\().4h     // V += gv * g (first 4)
>> +    smlal2  \u_dst2\().4s, \u_coef1\().8h, \g\().8h     // U += gu * g (second 4)
>> +    smlal2  \v_dst2\().4s, \v_coef1\().8h, \g\().8h     // V += gv * g (second 4)
> 
> If by "non-performant mobile" you mean small in-order cores, most of them can handle repeated accumulation like these even faster, if you sequence these so that all accumulations to one register happen sequentially. E.g. first all "smlal \u_dst1\().4s", followed by all "smlal \u_dst2\().4s", followed by \v_dst1, followed by \v_dst2. It's worth benchmarking if you do have access to such cores (e.g. Cortex-A53/A55; perhaps that's also the case on the Cortex-R you mentioned in the commit message).
> 
> That kind of code sequence is very counterintuitive, when considering instruction scheduling for an in-order core, but there is an explicit mention of it in the code optimization guides for them; it's usually worthwhile to test out such a form of these accumulations.
> 
>> +
>> +    smlal   \u_dst1\().4s, \u_coef2\().4h, \b\().4h     // U += bu * b (first 4)
>> +    smlal   \v_dst1\().4s, \v_coef2\().4h, \b\().4h     // V += bv * b (first 4)
>> +    smlal2  \u_dst2\().4s, \u_coef2\().8h, \b\().8h     // U += bu * b (second 4)
>> +    smlal2  \v_dst2\().4s, \v_coef2\().8h, \b\().8h     // V += bv * b (second 4)
>> +
>> +    sqshrn  \u_dst\().4h, \u_dst1\().4s, \right_shift   // U first 4 pixels
>> +    sqshrn2 \u_dst\().8h, \u_dst2\().4s, \right_shift   // U all 8 pixels
>> +    sqshrn  \v_dst\().4h, \v_dst1\().4s, \right_shift   // V first 4 pixels
>> +    sqshrn2 \v_dst\().8h, \v_dst2\().4s, \right_shift   // V all 8 pixels
>> +.endm
>> +
>> .macro rgbToY_neon fmt_bgr, fmt_rgb, element, alpha_first=0
>> function ff_\fmt_bgr\()ToY_neon, export=1
>> -        cmp             w4, #0                  // check width > 0
>> +        cbz             w4, 3f                  // check width > 0
>>        ldp             w12, w11, [x5]          // w12: ry, w11: gy
>>        ldr             w10, [x5, #8]           // w10: by
>> -        b.gt            4f
>> -        ret
>> +        b               4f
>> endfunc
>> 
>> function ff_\fmt_rgb\()ToY_neon, export=1
>> -        cmp             w4, #0                  // check width > 0
>> +        cbz             w4, 3f                  // check width > 0
>>        ldp             w10, w11, [x5]          // w10: ry, w11: gy
>>        ldr             w12, [x5, #8]           // w12: by
>> -        b.le            3f
>> 4:
>>        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
>>        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
>> @@ -158,8 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1
>> 
>> .macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
>> function ff_\fmt_bgr\()ToUV_half_neon, export=1
>> -        cmp             w5, #0          // check width > 0
>> -        b.le            3f
>> +        cbz             w5, 3f          // check width > 0
>> 
>>        ldp             w12, w11, [x6, #12]
>>        ldp             w10, w15, [x6, #20]
>> @@ -168,7 +187,7 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1
>> endfunc
>> 
>> function ff_\fmt_rgb\()ToUV_half_neon, export=1
>> -        cmp             w5, #0          // check width > 0
>> +        cmp             w5, #0                  // check width > 0
>>        b.le            3f
>> 
>>        ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
>> @@ -178,32 +197,41 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
>>        cmp             w5, #8
>>        rgb_set_uv_coeff half=1
>>        b.lt            2f
>> -1:
>> +1:  // load 16 pixels and prefetch memory for the next block
>>    .if \element == 3
>> -        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
>> +        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
>> +        prfm            pldl1strm, [x3, #48]
>>    .else
>> -        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
>> +        prfm            pldl1strm, [x3, #64]
>>    .endif
>> +
>>    .if \alpha_first
>> -        uaddlp          v21.8h, v19.16b
>> -        uaddlp          v20.8h, v18.16b
>> -        uaddlp          v19.8h, v17.16b
>> +        uaddlp          v21.8h, v19.16b         // v21: summed b pairs
>> +        uaddlp          v20.8h, v18.16b         // v20: summed g pairs
>> +        uaddlp          v19.8h, v17.16b         // v19: summed r pairs
>>    .else
>> -        uaddlp          v19.8h, v16.16b         // v19: r
>> -        uaddlp          v20.8h, v17.16b         // v20: g
>> -        uaddlp          v21.8h, v18.16b         // v21: b
>> +        uaddlp          v19.8h, v16.16b         // v19: summed r pairs
>> +        uaddlp          v20.8h, v17.16b         // v20: summed g pairs
>> +        uaddlp          v21.8h, v18.16b         // v21: summed b pairs
>>    .endif
>> 
>> -        rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>> -        rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>> -        sub             w5, w5, #8              // width -= 8
>> -        add             x3, x3, #(16*\element)
>> -        cmp             w5, #8                  // width >= 8 ?
>> +        mov             v22.16b, v6.16b         // U first half
>> +        mov             v23.16b, v6.16b         // U second half
>> +        mov             v24.16b, v6.16b         // V first half
>> +        mov             v25.16b, v6.16b         // V second half
>> +
>> +        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
>> +
>>        str             q16, [x0], #16          // store dst_u
>>        str             q17, [x1], #16          // store dst_v
>> +
>> +        sub             w5, w5, #8              // width -= 8
>> +        cmp             w5, #8                  // width >= 8 ?
>>        b.ge            1b
>> -        cbz             w5, 3f
>> -2:
>> +        cbz             w5, 3f                  // No pixels left? Exit
>> +
>> +2:      // Scalar fallback for remaining pixels
>> .if \alpha_first
>>        rgb_load_add_half 1, 5, 2, 6, 3, 7
>> .else
>> @@ -213,21 +241,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
>>        rgb_load_add_half 0, 4, 1, 5, 2, 6
>>    .endif
>> .endif
>> -
>>        smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
>> +        smaddl          x16, w2, w13, x9        // dst_v = rv * r + const_offset (parallel)
>> +
>>        smaddl          x8, w4, w11, x8         // dst_u += gu * g
>> +        smaddl          x16, w4, w14, x16       // dst_v += gv * g (parallel)
>> +
>>        smaddl          x8, w7, w12, x8         // dst_u += bu * b
>> -        asr             x8, x8, #10             // dst_u >>= 10
>> +        smaddl          x16, w7, w15, x16       // dst_v += bv * b (parallel)
>> +
>> +        asr             w8, w8, #10             // dst_u >>= 10
>> +        asr             w16, w16, #10           // dst_v >>= 10
>> +
>>        strh            w8, [x0], #2            // store dst_u
>> -
>> -        smaddl          x8, w2, w13, x9         // dst_v = rv * r + const_offset
>> -        smaddl          x8, w4, w14, x8         // dst_v += gv * g
>> -        smaddl          x8, w7, w15, x8         // dst_v += bv * b
>> -        asr             x8, x8, #10             // dst_v >>= 10
>> -        sub             w5, w5, #1
>> -        add             x3, x3, #(2*\element)
>> -        strh            w8, [x1], #2            // store dst_v
>> -        cbnz            w5, 2b
>> +        strh            w16, [x1], #2           // store dst_v
>> +
>> +        sub             w5, w5, #1              // width--
>> +        add             x3, x3, #(2*\element)   // Advance source pointer
>> +        cbnz            w5, 2b                  // Process next pixel if any left
>> 3:
>>        ret
>> endfunc
>> @@ -244,9 +275,9 @@ function ff_\fmt_bgr\()ToUV_neon, export=1
>>        cmp             w5, #0                  // check width > 0
>>        b.le            3f
>> 
>> -        ldp             w12, w11, [x6, #12]
>> -        ldp             w10, w15, [x6, #20]
>> -        ldp             w14, w13, [x6, #28]
>> +        ldp             w12, w11, [x6, #12]     // bu, gu
>> +        ldp             w10, w15, [x6, #20]     // ru, bv
>> +        ldp             w14, w13, [x6, #28]     // gv, rv
>>        b               4f
>> endfunc
>> 
>> @@ -263,21 +294,48 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
>>        b.lt            2f
>> 1:
>>    .if \alpha_first
>> -        argb_to_yuv_load_rgb x3
>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
>> +        uxtl            v21.8h, v19.8b             // v21: b
>> +        uxtl2           v24.8h, v19.16b            // v24: b
>> +        uxtl            v19.8h, v17.8b             // v19: r
>> +        uxtl            v20.8h, v18.8b             // v20: g
>> +        uxtl2           v22.8h, v17.16b            // v22: r
>> +        uxtl2           v23.8h, v18.16b            // v23: g
> 
> The code here, and below, is exactly the same as before, except for the postincrement on the load (plus the prefetch). Can we add the postincrement to the macro rather than unmacroing the code?
> 
>>    .else
>> -        rgb_to_yuv_load_rgb x3, \element
>> +        .if \element == 3
>> +            ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
>> +            prfm            pldl1strm, [x3, #48]
> 
> Instead of adding unusual indentation of the instructions here, you could use 2 instead of 4 spaces for the .if directives, to keep the vertical alignment of the instructions.
> 
>> +        .else // element == 4
>> +            ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
>> +            prfm            pldl1strm, [x3, #64]
>> +        .endif
>> +        uxtl            v19.8h, v16.8b             // v19: r
>> +        uxtl            v20.8h, v17.8b             // v20: g
>> +        uxtl            v21.8h, v18.8b             // v21: b
>> +        uxtl2           v22.8h, v16.16b            // v22: r
>> +        uxtl2           v23.8h, v17.16b            // v23: g
>> +        uxtl2           v24.8h, v18.16b            // v24: b
>>    .endif
>> -        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>> -        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>> -        rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
>> -        rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
>> -        sub             w5, w5, #16
>> -        add             x3, x3, #(16*\element)
>> -        cmp             w5, #16
>> -        stp             q16, q17, [x0], #32     // store to dst_u
>> -        stp             q18, q19, [x1], #32     // store to dst_v
>> +        // process 2 groups of 8 pixels
>> +        mov             v25.16b, v6.16b         // U_dst1 = const_offset (32-bit accumulators)
>> +        mov             v26.16b, v6.16b         // U_dst2 = const_offset
>> +        mov             v27.16b, v6.16b         // V_dst1 = const_offset
>> +        mov             v28.16b, v6.16b         // V_dst2 = const_offset
>> +        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v16, v18, #9
>> +
>> +        mov             v25.16b, v6.16b
>> +        mov             v26.16b, v6.16b
>> +        mov             v27.16b, v6.16b
>> +        mov             v28.16b, v6.16b
>> +        rgb_to_uv_interleaved_product v22, v23, v24, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v17, v19, #9
>> +
>> +        sub             w5, w5, #16             // width -= 16
>> +        cmp             w5, #16                 // width >= 16 ?
>> +        stp             q16, q17, [x0], #32     // store to dst_u (post-increment)
>> +        stp             q18, q19, [x1], #32     // store to dst_v (post-increment)
>>        b.ge            1b
>> -        cbz             w5, 3f
>> +        cbz             w5, 3f                  // No pixels left? Exit
>> +
>> 2:
>>    .if \alpha_first
>>        ldrb            w16, [x3, #1]           // w16: r
>> @@ -292,13 +350,13 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
>>        smaddl          x8, w16, w10, x9        // x8 = ru * r + const_offset
>>        smaddl          x8, w17, w11, x8        // x8 += gu * g
>>        smaddl          x8, w4, w12, x8         // x8 += bu * b
>> -        asr             w8, w8, #9              // x8 >>= 9
>> +        asr             x8, x8, #9              // x8 >>= 9
>>        strh            w8, [x0], #2            // store to dst_u
> 
> Does this make any practical difference, as we're just storing the lower 32 bits anyway?
> 
> // Martin
> 


* Re: [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations
  2025-05-29 18:53 ` Martin Storsjö
  2025-05-29 21:38   ` Dmitriy Kovalenko
@ 2025-05-30  7:07   ` Martin Storsjö
  1 sibling, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2025-05-30  7:07 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Dmitriy Kovalenko

On Thu, 29 May 2025, Martin Storsjö wrote:

> In this case, they also add a direct dependence on 
> the updated pointer register from the directly preceding instruction, which 
> _is_ harmful on in-order cores, unless it entirely ignores the instruction.)

I did benchmark this, and indeed it causes a large slowdown on Cortex A53:

After this patch:
bgra_to_uv_8_neon:          209.0 ( 0.75x)
bgra_to_uv_128_neon:        541.5 ( 4.09x)
bgra_to_uv_1080_neon:      4584.8 ( 4.03x)
bgra_to_uv_1920_neon:      7865.9 ( 4.18x)
bgra_to_uv_half_8_neon:     112.0 ( 0.85x)
bgra_to_uv_half_128_neon:   348.8 ( 3.58x)
bgra_to_uv_half_1080_neon: 2873.6 ( 3.60x)
bgra_to_uv_half_1920_neon: 4973.5 ( 3.69x)

With the prfm instructions removed:

bgra_to_uv_8_neon:          215.3 ( 0.72x)
bgra_to_uv_128_neon:        516.5 ( 4.30x)
bgra_to_uv_1080_neon:      4387.6 ( 4.21x)
bgra_to_uv_1920_neon:      7503.5 ( 4.37x)
bgra_to_uv_half_8_neon:     111.8 ( 0.86x)
bgra_to_uv_half_128_neon:   331.8 ( 3.77x)
bgra_to_uv_half_1080_neon: 2739.1 ( 3.78x)
bgra_to_uv_half_1920_neon: 4728.9 ( 3.88x)

This is 5% faster when the prfm instructions are removed.

Since they are controversial within ffmpeg, I urge you to split the
addition of prefetch instructions into a separate patch.

If they are scheduled where they are now, right after a postincrement load 
that updates the pointer, it is problematic for in-order cores. If you can 
schedule them elsewhere where they don't actively hurt in-order cores, we 
can maybe consider it.
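
For instance (sketch, untested), the prefetch could be moved down past
a few instructions that don't depend on x3, so that the address update
has completed by the time it issues:

        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
        uaddlp          v19.8h, v16.16b
        uaddlp          v20.8h, v17.16b
        prfm            pldl1strm, [x3, #64]
        uaddlp          v21.8h, v18.16b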

>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>> index c1c0adffc8..ee8eb24c14 100644
>> --- a/libswscale/aarch64/input.S
>> +++ b/libswscale/aarch64/input.S
>> @@ -1,5 +1,4 @@
>> -/*
>> - * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>> +/* Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>
> This is an unrelated change

This comment is unaddressed

>>  *
>>  * This file is part of FFmpeg.
>>  *
>> @@ -57,20 +56,41 @@
>>         sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>> .endm
>> 
>> +// interleaved product version of the rgb to yuv gives slightly better performance on non-performant mobile
>> +.macro rgb_to_uv_interleaved_product r, g, b, u_coef0, u_coef1, u_coef2, v_coef0, v_coef1, v_coef2, u_dst1, u_dst2, v_dst1, v_dst2, u_dst, v_dst, right_shift
>> +    smlal   \u_dst1\().4s, \u_coef0\().4h, \r\().4h     // U += ru * r (first 4)
>> +    smlal   \v_dst1\().4s, \v_coef0\().4h, \r\().4h     // V += rv * r (first 4)
>> +    smlal2  \u_dst2\().4s, \u_coef0\().8h, \r\().8h     // U += ru * r (second 4)
>> +    smlal2  \v_dst2\().4s, \v_coef0\().8h, \r\().8h     // V += rv * r (second 4)
>> +
>
> The patch adds trailing whitespace here and in many other places; make sure 
> you don't do that. (It is visible by doing e.g. "git show".)

This comment is unaddressed - you still have trailing whitespace.

> Also, as a general rule, indent instructions within macros in the same way as 
> elsewhere.

The instructions are better indented now, but the operand column is still 
misindented by one space. The branch I referred you to on github does 
contain an indent checker script which would point this out.

>> +    smlal   \u_dst1\().4s, \u_coef1\().4h, \g\().4h     // U += gu * g 
>> (first 4)
>> +    smlal   \v_dst1\().4s, \v_coef1\().4h, \g\().4h     // V += gv * g 
>> (first 4)
>> +    smlal2  \u_dst2\().4s, \u_coef1\().8h, \g\().8h     // U += gu * g 
>> (second 4)
>> +    smlal2  \v_dst2\().4s, \v_coef1\().8h, \g\().8h     // V += gv * g 
>> (second 4)
>
> If by "non-performant mobile" you mean small in-order cores, most of them
> can handle repeated accumulation like these even faster, if you sequence
> these so that all accumulations to one register happen sequentially. E.g.
> first all "smlal \u_dst1\().4s", followed by all "smlal \u_dst2\().4s",
> followed by \v_dst1, followed by \v_dst2. It's worth benchmarking if you do
> have access to such cores (e.g. Cortex-A53/A55; perhaps that's also the
> case on the Cortex-R you mentioned in the commit message).
>
> That kind of code sequence is very counterintuitive, when considering 
> instruction scheduling for an in-order core, but there is an explicit mention 
> of it in the code optimization guides for them; it's usually worthwhile to 
> test out such a form of these accumulations.

This comment is unaddressed

>
>> +
>> +    smlal   \u_dst1\().4s, \u_coef2\().4h, \b\().4h     // U += bu * b (first 4)
>> +    smlal   \v_dst1\().4s, \v_coef2\().4h, \b\().4h     // V += bv * b (first 4)
>> +    smlal2  \u_dst2\().4s, \u_coef2\().8h, \b\().8h     // U += bu * b (second 4)
>> +    smlal2  \v_dst2\().4s, \v_coef2\().8h, \b\().8h     // V += bv * b (second 4)
>> +
>> +    sqshrn  \u_dst\().4h, \u_dst1\().4s, \right_shift   // U first 4 pixels
>> +    sqshrn2 \u_dst\().8h, \u_dst2\().4s, \right_shift   // U all 8 pixels
>> +    sqshrn  \v_dst\().4h, \v_dst1\().4s, \right_shift   // V first 4 pixels
>> +    sqshrn2 \v_dst\().8h, \v_dst2\().4s, \right_shift   // V all 8 pixels
>> +.endm
>> +
>> .macro rgbToY_neon fmt_bgr, fmt_rgb, element, alpha_first=0
>> function ff_\fmt_bgr\()ToY_neon, export=1
>> -        cmp             w4, #0                  // check width > 0
>> +        cbz             w4, 3f                  // check width > 0
>>         ldp             w12, w11, [x5]          // w12: ry, w11: gy
>>         ldr             w10, [x5, #8]           // w10: by
>> -        b.gt            4f
>> -        ret
>> +        b               4f
>> endfunc
>> 
>> function ff_\fmt_rgb\()ToY_neon, export=1
>> -        cmp             w4, #0                  // check width > 0
>> +        cbz             w4, 3f                  // check width > 0
>>         ldp             w10, w11, [x5]          // w10: ry, w11: gy
>>         ldr             w12, [x5, #8]           // w12: by
>> -        b.le            3f
>> 4:
>>         mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
>>         movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
>> @@ -158,8 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1
>> 
>> .macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
>> function ff_\fmt_bgr\()ToUV_half_neon, export=1
>> -        cmp             w5, #0          // check width > 0
>> -        b.le            3f
>> +        cbz             w5, 3f          // check width > 0
>>
>>         ldp             w12, w11, [x6, #12]
>>         ldp             w10, w15, [x6, #20]
>> @@ -168,7 +187,7 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1
>> endfunc
>> 
>> function ff_\fmt_rgb\()ToUV_half_neon, export=1
>> -        cmp             w5, #0          // check width > 0
>> +        cmp             w5, #0                  // check width > 0
>>         b.le            3f
>>
>>         ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
>> @@ -178,32 +197,41 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
>>         cmp             w5, #8
>>         rgb_set_uv_coeff half=1
>>         b.lt            2f
>> -1:
>> +1:  // load 16 pixels and prefetch memory for the next block
>>     .if \element == 3
>> -        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
>> +        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
>> +        prfm            pldl1strm, [x3, #48]
>>     .else
>> -        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
>> +        prfm            pldl1strm, [x3, #64]
>>     .endif
>> +
>>     .if \alpha_first
>> -        uaddlp          v21.8h, v19.16b
>> -        uaddlp          v20.8h, v18.16b
>> -        uaddlp          v19.8h, v17.16b
>> +        uaddlp          v21.8h, v19.16b         // v21: summed b pairs
>> +        uaddlp          v20.8h, v18.16b         // v20: summed g pairs
>> +        uaddlp          v19.8h, v17.16b         // v19: summed r pairs
>>     .else
>> -        uaddlp          v19.8h, v16.16b         // v19: r
>> -        uaddlp          v20.8h, v17.16b         // v20: g
>> -        uaddlp          v21.8h, v18.16b         // v21: b
>> +        uaddlp          v19.8h, v16.16b         // v19: summed r pairs
>> +        uaddlp          v20.8h, v17.16b         // v20: summed g pairs
>> +        uaddlp          v21.8h, v18.16b         // v21: summed b pairs
>>     .endif
>> 
>> -        rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>> -        rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>> -        sub             w5, w5, #8              // width -= 8
>> -        add             x3, x3, #(16*\element)
>> -        cmp             w5, #8                  // width >= 8 ?
>> +        mov             v22.16b, v6.16b         // U first half
>> +        mov             v23.16b, v6.16b         // U second half
>> +        mov             v24.16b, v6.16b         // V first half
>> +        mov             v25.16b, v6.16b         // V second half
>> +
>> +        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
>> +
>>         str             q16, [x0], #16          // store dst_u
>>         str             q17, [x1], #16          // store dst_v
>> +
>> +        sub             w5, w5, #8              // width -= 8
>> +        cmp             w5, #8                  // width >= 8 ?
>>         b.ge            1b
>> -        cbz             w5, 3f
>> -2:
>> +        cbz             w5, 3f                  // No pixels left? Exit
>> +
>> +2:      // Scalar fallback for remaining pixels
>> .if \alpha_first
>>         rgb_load_add_half 1, 5, 2, 6, 3, 7
>> .else
>> @@ -213,21 +241,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
>>         rgb_load_add_half 0, 4, 1, 5, 2, 6
>>     .endif
>> .endif
>> -
>>         smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
>> +        smaddl          x16, w2, w13, x9        // dst_v = rv * r + const_offset (parallel)
>> +
>>         smaddl          x8, w4, w11, x8         // dst_u += gu * g
>> +        smaddl          x16, w4, w14, x16       // dst_v += gv * g (parallel)
>> +
>>         smaddl          x8, w7, w12, x8         // dst_u += bu * b
>> -        asr             x8, x8, #10             // dst_u >>= 10
>> +        smaddl          x16, w7, w15, x16       // dst_v += bv * b (parallel)
>> +
>> +        asr             w8, w8, #10             // dst_u >>= 10
>> +        asr             w16, w16, #10           // dst_v >>= 10
>> +
>>         strh            w8, [x0], #2            // store dst_u
>> -
>> -        smaddl          x8, w2, w13, x9         // dst_v = rv * r + const_offset
>> -        smaddl          x8, w4, w14, x8         // dst_v += gv * g
>> -        smaddl          x8, w7, w15, x8         // dst_v += bv * b
>> -        asr             x8, x8, #10             // dst_v >>= 10
>> -        sub             w5, w5, #1
>> -        add             x3, x3, #(2*\element)
>> -        strh            w8, [x1], #2            // store dst_v
>> -        cbnz            w5, 2b
>> +        strh            w16, [x1], #2           // store dst_v
>> +
>> +        sub             w5, w5, #1              // width--
>> +        add             x3, x3, #(2*\element)   // Advance source pointer
>> +        cbnz            w5, 2b                  // Process next pixel if any left
>> 3:
>>         ret
>> endfunc
>> @@ -244,9 +275,9 @@ function ff_\fmt_bgr\()ToUV_neon, export=1
>>         cmp             w5, #0                  // check width > 0
>>         b.le            3f
>> 
>> -        ldp             w12, w11, [x6, #12]
>> -        ldp             w10, w15, [x6, #20]
>> -        ldp             w14, w13, [x6, #28]
>> +        ldp             w12, w11, [x6, #12]     // bu, gu
>> +        ldp             w10, w15, [x6, #20]     // ru, bv
>> +        ldp             w14, w13, [x6, #28]     // gv, rv
>>         b               4f
>> endfunc
>> 
>> +        .else // element == 4
>> +            ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
>> +            prfm            pldl1strm, [x3, #64]
>> +        .endif
>> +        uxtl            v19.8h, v16.8b             // v19: r
>> +        uxtl            v20.8h, v17.8b             // v20: g
>> +        uxtl            v21.8h, v18.8b             // v21: b
>> +        uxtl2           v22.8h, v16.16b            // v22: r
>> +        uxtl2           v23.8h, v17.16b            // v23: g
>> +        uxtl2           v24.8h, v18.16b            // v24: b
>>     .endif
>> -        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>> -        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>> -        rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
>> -        rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
>> -        sub             w5, w5, #16
>> -        add             x3, x3, #(16*\element)
>> -        cmp             w5, #16
>> -        stp             q16, q17, [x0], #32     // store to dst_u
>> -        stp             q18, q19, [x1], #32     // store to dst_v
>> +        // process 2 groups of 8 pixels
>> +        mov             v25.16b, v6.16b         // U_dst1 = const_offset (32-bit accumulators)
>> +        mov             v26.16b, v6.16b         // U_dst2 = const_offset
>> +        mov             v27.16b, v6.16b         // V_dst1 = const_offset
>> +        mov             v28.16b, v6.16b         // V_dst2 = const_offset
>> +        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v16, v18, #9
>> +
>> +        mov             v25.16b, v6.16b
>> +        mov             v26.16b, v6.16b
>> +        mov             v27.16b, v6.16b
>> +        mov             v28.16b, v6.16b
>> +        rgb_to_uv_interleaved_product v22, v23, v24, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v17, v19, #9
>> +
>> +        sub             w5, w5, #16             // width -= 16
>> +        cmp             w5, #16                 // width >= 16 ?
>> +        stp             q16, q17, [x0], #32     // store to dst_u (post-increment)
>> +        stp             q18, q19, [x1], #32     // store to dst_v (post-increment)
>>         b.ge            1b
>> -        cbz             w5, 3f
>> +        cbz             w5, 3f                  // No pixels left? Exit
>> +
>> 2:
>>     .if \alpha_first
>>         ldrb            w16, [x3, #1]           // w16: r
>> @@ -292,13 +350,13 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
>>         smaddl          x8, w16, w10, x9        // x8 = ru * r + const_offset
>>         smaddl          x8, w17, w11, x8        // x8 += gu * g
>>         smaddl          x8, w4, w12, x8         // x8 += bu * b
>> -        asr             w8, w8, #9              // x8 >>= 9
>> +        asr             x8, x8, #9              // x8 >>= 9
>>         strh            w8, [x0], #2            // store to dst_u
>
> Does this make any practical difference, as we're just storing the lower 32 
> bits anyway?

This comment is unaddressed.
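
To spell out why it makes no practical difference (a minimal sketch of
the dst_u path, using the same registers as the patch):

        smaddl  x8, w4, w12, x8     // x8 += bu * b (64-bit accumulate)
        asr     w8, w8, #9          // shifts only the low 32 bits, but...
        strh    w8, [x0], #2        // ...strh stores bits 0..15 of the result,
                                    // i.e. bits 9..24 of x8, which is exactly
                                    // what "asr x8, x8, #9" would store too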

// Martin

* Re: [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations
  2025-05-29 21:38   ` Dmitriy Kovalenko
@ 2025-05-30  7:09     ` Martin Storsjö
  2025-05-30  7:18       ` Dmitriy Kovalenko
  0 siblings, 1 reply; 9+ messages in thread
From: Martin Storsjö @ 2025-05-30  7:09 UTC (permalink / raw)
  To: Dmitriy Kovalenko; +Cc: FFmpeg development discussions and patches

On Thu, 29 May 2025, Dmitriy Kovalenko wrote:

> I appreciate the review of both commits. I fixed all the unrelated 
> changes and iterated in the new version; I would appreciate a 
> re-review.

Don't top post.

There are still at least 5 of my comments unaddressed. If you are not 
going to address them, then you need to respond to the comments and 
explain why you think the change should be kept as is.

// Martin


* Re: [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations
  2025-05-30  7:09     ` Martin Storsjö
@ 2025-05-30  7:18       ` Dmitriy Kovalenko
  2025-05-30  7:22         ` Martin Storsjö
  0 siblings, 1 reply; 9+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-30  7:18 UTC (permalink / raw)
  To: Martin Storsjö; +Cc: FFmpeg development discussions and patches

All the comments were addressed in the patch version 2 I sent earlier today, except the prefetch one. And how did you test the prefetch? I literally ran a native benchmark on the device just now, and with the patch applied I see a 5% performance improvement. Maybe there is an issue in the way you measure the timers? I can certainly remove the prefetches, since I anyway use my own implementation, which is 5-10x faster than ffmpeg's, but I am genuinely curious how it is possible that you see different benchmarking results.

> On May 30, 2025, at 09:09, Martin Storsjö <martin@martin.st> wrote:
> 
> On Thu, 29 May 2025, Dmitriy Kovalenko wrote:
> 
>> I appreciate the review of both commits. I fixed all the unrelated changes and iterated in the new version; I would appreciate a re-review.
> 
> Don't top post.
> 
> There are still at least 5 of my comments unaddressed. If you are not going to address them, then you need to respond to the comments and explain why you think the change should be kept as is.
> 
> // Martin
> 


* Re: [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations
  2025-05-30  7:18       ` Dmitriy Kovalenko
@ 2025-05-30  7:22         ` Martin Storsjö
  0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2025-05-30  7:22 UTC (permalink / raw)
  To: Dmitriy Kovalenko; +Cc: FFmpeg development discussions and patches

On Fri, 30 May 2025, Dmitriy Kovalenko wrote:

> All the comments were addressed

If you fail to read the inline responses, we can end this conversation 
right here.

> And how did you test the prefetch? I literally ran a native benchmark 
> on the device just now, and with the patch applied I see a 5% 
> performance improvement. Maybe there is an issue in the way you 
> measure the timers? I can certainly remove the prefetches, since I 
> anyway use my own implementation, which is 5-10x faster than 
> ffmpeg's, but I am genuinely curious how it is possible that you see 
> different benchmarking results.

I tested with checkasm, with patch 1/2, on Linux on a Cortex A53, as-is 
and with the prefetch instructions removed.

// Martin


* Re: [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations
  2025-05-31  9:11 ` Dmitriy Kovalenko
@ 2025-06-05 12:00   ` Martin Storsjö
  0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2025-06-05 12:00 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Dmitriy Kovalenko

On Sat, 31 May 2025, Dmitriy Kovalenko wrote:

> I've found quite a few ways to optimize ffmpeg's existing rgb to yuv
> subsampled conversion. In this patch stack I'll try to
> improve the performance.
>
> This particular set of changes is a small improvement to all the
> existing functions and macros. The biggest performance gain comes
> from the post-load increment of the pointer and immediate
> ~~prefetching of the memory blocks~~ (moved to the next patch in the stack), and from interleaving the multiply and shift operations
> on different registers for better scheduling.

Why keep the mention of prefetching here, when it is no longer included 
in the patch at all? This is the text you are suggesting be encoded as 
the final, immutable commit message describing this change.

I have further inline comments below, please read them all.

> Also changed a bunch of places that used cmp + b.le to use a single
> cbnz/tbnz instruction, and made some other small cleanups.
>
> Here are checkasm results on a MacBook Pro with the latest M4 Max
>
> <before>
>
> bgra_to_uv_1080_c:                                     257.5 ( 1.00x)
> bgra_to_uv_1080_neon:                                  211.9 ( 1.22x)
> bgra_to_uv_1920_c:                                     467.1 ( 1.00x)
> bgra_to_uv_1920_neon:                                  379.3 ( 1.23x)
> bgra_to_uv_half_1080_c:                                198.9 ( 1.00x)
> bgra_to_uv_half_1080_neon:                             125.7 ( 1.58x)
> bgra_to_uv_half_1920_c:                                346.3 ( 1.00x)
> bgra_to_uv_half_1920_neon:                             223.7 ( 1.55x)
>
> <after>
>
> bgra_to_uv_1080_c:                                     268.3 ( 1.00x)
> bgra_to_uv_1080_neon:                                  176.0 ( 1.53x)
> bgra_to_uv_1920_c:                                     456.6 ( 1.00x)
> bgra_to_uv_1920_neon:                                  307.7 ( 1.48x)
> bgra_to_uv_half_1080_c:                                193.2 ( 1.00x)
> bgra_to_uv_half_1080_neon:                              96.8 ( 2.00x)
> bgra_to_uv_half_1920_c:                                347.2 ( 1.00x)
> bgra_to_uv_half_1920_neon:                             182.6 ( 1.92x)
>
> With my proprietary test on iOS it gives around a 70% performance
> improvement when converting a 1920x1920 bgra image to yuv420p.
>
> On my Linux ARM Cortex-R system the performance improvement is not as
> pronounced, but it is still consistently 5-10% faster than the current
> implementation.
> ---
> libswscale/aarch64/input.S | 143 +++++++++++++++++++++++--------------
> 1 file changed, 91 insertions(+), 52 deletions(-)

> @@ -292,7 +330,7 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
>         smaddl          x8, w16, w10, x9        // x8 = ru * r + const_offset
>         smaddl          x8, w17, w11, x8        // x8 += gu * g
>         smaddl          x8, w4, w12, x8         // x8 += bu * b
> -        asr             w8, w8, #9              // x8 >>= 9
> +        asr             x8, x8, #9              // x8 >>= 9
>         strh            w8, [x0], #2            // store to dst_u
>

Here you _still_ have one instance of these unrelated changes left in your 
patch.

>         smaddl          x8, w16, w13, x9        // x8 = rv * r + const_offset
> @@ -401,3 +439,4 @@ endfunc
>
> DISABLE_DOTPROD
> #endif
> +
> --

Here you are adding one unrelated empty line at the end of the file. Don't 
include any unrelated changes in your patches.

Before sending a patch, do review it yourself first, checking for any such 
unrelated stray changes.

Other than those details, the rest of the patch looks ok.

// Martin


* [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations
       [not found] <20250531091631.45342-1-dmtr.kovalenko@outlook.com>
@ 2025-05-31  9:11 ` Dmitriy Kovalenko
  2025-06-05 12:00   ` Martin Storsjö
  0 siblings, 1 reply; 9+ messages in thread
From: Dmitriy Kovalenko @ 2025-05-31  9:11 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Dmitriy Kovalenko

I've found quite a few ways to optimize ffmpeg's existing rgb to yuv
subsampled conversion. In this patch stack I'll try to
improve the performance.

This particular set of changes is a small improvement to all the
existing functions and macros. The biggest performance gain comes
from the post-load increment of the pointer and immediate
~~prefetching of the memory blocks~~ (moved to the next patch in the stack), and from interleaving the multiply and shift operations
on different registers for better scheduling.
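
For example, the addressing-mode part of the change looks like this (a
minimal sketch, not lifted verbatim from the patch; the interleaving
itself is shown by the rgb_to_uv_interleaved_product macro below):

        // before: plain load, with a separate pointer increment later on
        ld3     { v16.16b, v17.16b, v18.16b }, [x3]
        add     x3, x3, #48
        // after: the post-indexed load folds the increment into the load
        ld3     { v16.16b, v17.16b, v18.16b }, [x3], #48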

Also changed a bunch of places that used cmp + b.le to use a single
cbnz/tbnz instruction, and made some other small cleanups.
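
A sketch of that branch cleanup (assuming, as in these functions, that
the width in w4 is never negative, so "<= 0" and "== 0" coincide):

        // before: compare against zero, then branch on the flags
        cmp     w4, #0
        b.le    3f
        // after: a single compare-and-branch-if-zero instruction
        cbz     w4, 3f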

Here are checkasm results on a MacBook Pro with the latest M4 Max

<before>

bgra_to_uv_1080_c:                                     257.5 ( 1.00x)
bgra_to_uv_1080_neon:                                  211.9 ( 1.22x)
bgra_to_uv_1920_c:                                     467.1 ( 1.00x)
bgra_to_uv_1920_neon:                                  379.3 ( 1.23x)
bgra_to_uv_half_1080_c:                                198.9 ( 1.00x)
bgra_to_uv_half_1080_neon:                             125.7 ( 1.58x)
bgra_to_uv_half_1920_c:                                346.3 ( 1.00x)
bgra_to_uv_half_1920_neon:                             223.7 ( 1.55x)

<after>

bgra_to_uv_1080_c:                                     268.3 ( 1.00x)
bgra_to_uv_1080_neon:                                  176.0 ( 1.53x)
bgra_to_uv_1920_c:                                     456.6 ( 1.00x)
bgra_to_uv_1920_neon:                                  307.7 ( 1.48x)
bgra_to_uv_half_1080_c:                                193.2 ( 1.00x)
bgra_to_uv_half_1080_neon:                              96.8 ( 2.00x)
bgra_to_uv_half_1920_c:                                347.2 ( 1.00x)
bgra_to_uv_half_1920_neon:                             182.6 ( 1.92x)

With my proprietary test on iOS it gives around a 70% performance
improvement when converting a 1920x1920 bgra image to yuv420p.

On my Linux ARM Cortex-R system the performance improvement is not as
pronounced, but it is still consistently 5-10% faster than the current
implementation.
---
 libswscale/aarch64/input.S | 143 +++++++++++++++++++++++--------------
 1 file changed, 91 insertions(+), 52 deletions(-)

diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index c1c0adffc8..260a26e965 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -22,9 +22,9 @@
 
 .macro rgb_to_yuv_load_rgb src, element=3
     .if \element == 3
-        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
+        ld3             { v16.16b, v17.16b, v18.16b }, [\src], #48
     .else
-        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src], #64
     .endif
         uxtl            v19.8h, v16.8b             // v19: r
         uxtl            v20.8h, v17.8b             // v20: g
@@ -35,7 +35,7 @@
 .endm
 
 .macro argb_to_yuv_load_rgb src
-        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src], #64
         uxtl            v21.8h, v19.8b             // v21: b
         uxtl2           v24.8h, v19.16b            // v24: b
         uxtl            v19.8h, v17.8b             // v19: r
@@ -57,20 +57,41 @@
         sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
 .endm
 
+// Interleaved product version of the rgb to yuv conversion; gives slightly better performance on lower-end mobile cores
+.macro rgb_to_uv_interleaved_product r, g, b, u_coef0, u_coef1, u_coef2, v_coef0, v_coef1, v_coef2, u_dst1, u_dst2, v_dst1, v_dst2, u_dst, v_dst, right_shift
+        smlal           \u_dst1\().4s, \u_coef0\().4h, \r\().4h     // U += ru * r (first 4)
+        smlal           \v_dst1\().4s, \v_coef0\().4h, \r\().4h     // V += rv * r (first 4)
+        smlal2          \u_dst2\().4s, \u_coef0\().8h, \r\().8h     // U += ru * r (second 4)
+        smlal2          \v_dst2\().4s, \v_coef0\().8h, \r\().8h     // V += rv * r (second 4)
+
+        smlal           \u_dst1\().4s, \u_coef1\().4h, \g\().4h     // U += gu * g (first 4)
+        smlal           \v_dst1\().4s, \v_coef1\().4h, \g\().4h     // V += gv * g (first 4)
+        smlal2          \u_dst2\().4s, \u_coef1\().8h, \g\().8h     // U += gu * g (second 4)
+        smlal2          \v_dst2\().4s, \v_coef1\().8h, \g\().8h     // V += gv * g (second 4)
+
+        smlal           \u_dst1\().4s, \u_coef2\().4h, \b\().4h     // U += bu * b (first 4)
+        smlal           \v_dst1\().4s, \v_coef2\().4h, \b\().4h     // V += bv * b (first 4)
+        smlal2          \u_dst2\().4s, \u_coef2\().8h, \b\().8h     // U += bu * b (second 4)
+        smlal2          \v_dst2\().4s, \v_coef2\().8h, \b\().8h     // V += bv * b (second 4)
+
+        sqshrn          \u_dst\().4h, \u_dst1\().4s, \right_shift   // U first 4 pixels
+        sqshrn2         \u_dst\().8h, \u_dst2\().4s, \right_shift   // U all 8 pixels
+        sqshrn          \v_dst\().4h, \v_dst1\().4s, \right_shift   // V first 4 pixels
+        sqshrn2         \v_dst\().8h, \v_dst2\().4s, \right_shift   // V all 8 pixels
+.endm
+
 .macro rgbToY_neon fmt_bgr, fmt_rgb, element, alpha_first=0
 function ff_\fmt_bgr\()ToY_neon, export=1
-        cmp             w4, #0                  // check width > 0
+        cbz             w4, 3f                  // check width > 0
         ldp             w12, w11, [x5]          // w12: ry, w11: gy
         ldr             w10, [x5, #8]           // w10: by
-        b.gt            4f
-        ret
+        b               4f
 endfunc
 
 function ff_\fmt_rgb\()ToY_neon, export=1
-        cmp             w4, #0                  // check width > 0
+        cbz             w4, 3f                  // check width > 0
         ldp             w10, w11, [x5]          // w10: ry, w11: gy
         ldr             w12, [x5, #8]           // w12: by
-        b.le            3f
 4:
         mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
         movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
@@ -90,7 +111,6 @@ function ff_\fmt_rgb\()ToY_neon, export=1
         rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
         rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
         sub             w4, w4, #16             // width -= 16
-        add             x1, x1, #(16*\element)
         cmp             w4, #16                 // width >= 16 ?
         stp             q16, q17, [x0], #32     // store to dst
         b.ge            1b
@@ -158,8 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1
 
 .macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
 function ff_\fmt_bgr\()ToUV_half_neon, export=1
-        cmp             w5, #0          // check width > 0
-        b.le            3f
+        cbz             w5, 3f          // check width > 0
 
         ldp             w12, w11, [x6, #12]
         ldp             w10, w15, [x6, #20]
@@ -168,7 +187,7 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1
 endfunc
 
 function ff_\fmt_rgb\()ToUV_half_neon, export=1
-        cmp             w5, #0          // check width > 0
+        cmp             w5, #0                  // check width > 0
         b.le            3f
 
         ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
@@ -178,32 +197,39 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
         cmp             w5, #8
         rgb_set_uv_coeff half=1
         b.lt            2f
-1:
+1:  // load 16 pixels
     .if \element == 3
-        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
+        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
     .else
-        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
     .endif
+
     .if \alpha_first
-        uaddlp          v21.8h, v19.16b
-        uaddlp          v20.8h, v18.16b
-        uaddlp          v19.8h, v17.16b
+        uaddlp          v21.8h, v19.16b         // v21: summed b pairs
+        uaddlp          v20.8h, v18.16b         // v20: summed g pairs
+        uaddlp          v19.8h, v17.16b         // v19: summed r pairs
     .else
-        uaddlp          v19.8h, v16.16b         // v19: r
-        uaddlp          v20.8h, v17.16b         // v20: g
-        uaddlp          v21.8h, v18.16b         // v21: b
+        uaddlp          v19.8h, v16.16b         // v19: summed r pairs
+        uaddlp          v20.8h, v17.16b         // v20: summed g pairs
+        uaddlp          v21.8h, v18.16b         // v21: summed b pairs
     .endif
 
-        rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
-        rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
-        sub             w5, w5, #8              // width -= 8
-        add             x3, x3, #(16*\element)
-        cmp             w5, #8                  // width >= 8 ?
+        mov             v22.16b, v6.16b         // U first half
+        mov             v23.16b, v6.16b         // U second half
+        mov             v24.16b, v6.16b         // V first half
+        mov             v25.16b, v6.16b         // V second half
+
+        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
+
         str             q16, [x0], #16          // store dst_u
         str             q17, [x1], #16          // store dst_v
+
+        sub             w5, w5, #8              // width -= 8
+        cmp             w5, #8                  // width >= 8 ?
         b.ge            1b
-        cbz             w5, 3f
-2:
+        cbz             w5, 3f                  // No pixels left? Exit
+
+2:      // Scalar fallback for remaining pixels
 .if \alpha_first
         rgb_load_add_half 1, 5, 2, 6, 3, 7
 .else
@@ -213,21 +239,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
         rgb_load_add_half 0, 4, 1, 5, 2, 6
     .endif
 .endif
-
         smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
+        smaddl          x16, w2, w13, x9        // dst_v = rv * r + const_offset (parallel)
+
         smaddl          x8, w4, w11, x8         // dst_u += gu * g
+        smaddl          x16, w4, w14, x16       // dst_v += gv * g (parallel)
+
         smaddl          x8, w7, w12, x8         // dst_u += bu * b
-        asr             x8, x8, #10             // dst_u >>= 10
+        smaddl          x16, w7, w15, x16       // dst_v += bv * b (parallel)
+
+        asr             w8, w8, #10             // dst_u >>= 10
+        asr             w16, w16, #10           // dst_v >>= 10
+
         strh            w8, [x0], #2            // store dst_u
+        strh            w16, [x1], #2           // store dst_v
 
-        smaddl          x8, w2, w13, x9         // dst_v = rv * r + const_offset
-        smaddl          x8, w4, w14, x8         // dst_v += gv * g
-        smaddl          x8, w7, w15, x8         // dst_v += bv * b
-        asr             x8, x8, #10             // dst_v >>= 10
-        sub             w5, w5, #1
-        add             x3, x3, #(2*\element)
-        strh            w8, [x1], #2            // store dst_v
-        cbnz            w5, 2b
+        sub             w5, w5, #1              // width--
+        add             x3, x3, #(2*\element)   // Advance source pointer
+        cbnz            w5, 2b                  // Process next pixel if any left
 3:
         ret
 endfunc
@@ -244,9 +273,9 @@ function ff_\fmt_bgr\()ToUV_neon, export=1
         cmp             w5, #0                  // check width > 0
         b.le            3f
 
-        ldp             w12, w11, [x6, #12]
-        ldp             w10, w15, [x6, #20]
-        ldp             w14, w13, [x6, #28]
+        ldp             w12, w11, [x6, #12]     // bu, gu
+        ldp             w10, w15, [x6, #20]     // ru, bv
+        ldp             w14, w13, [x6, #28]     // gv, rv
         b               4f
 endfunc
 
@@ -267,17 +296,26 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
     .else
         rgb_to_yuv_load_rgb x3, \element
     .endif
-        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
-        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
-        rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
-        rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
-        sub             w5, w5, #16
-        add             x3, x3, #(16*\element)
-        cmp             w5, #16
-        stp             q16, q17, [x0], #32     // store to dst_u
-        stp             q18, q19, [x1], #32     // store to dst_v
+        // process 2 groups of 8 pixels
+        mov             v25.16b, v6.16b         // U_dst1 = const_offset (32-bit accumulators)
+        mov             v26.16b, v6.16b         // U_dst2 = const_offset
+        mov             v27.16b, v6.16b         // V_dst1 = const_offset
+        mov             v28.16b, v6.16b         // V_dst2 = const_offset
+        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v16, v18, #9
+
+        mov             v25.16b, v6.16b
+        mov             v26.16b, v6.16b
+        mov             v27.16b, v6.16b
+        mov             v28.16b, v6.16b
+        rgb_to_uv_interleaved_product v22, v23, v24, v0, v1, v2, v3, v4, v5, v25, v26, v27, v28, v17, v19, #9
+
+        sub             w5, w5, #16             // width -= 16
+        cmp             w5, #16                 // width >= 16 ?
+        stp             q16, q17, [x0], #32     // store to dst_u (post-increment)
+        stp             q18, q19, [x1], #32     // store to dst_v (post-increment)
         b.ge            1b
-        cbz             w5, 3f
+        cbz             w5, 3f                  // No pixels left? Exit
+
 2:
     .if \alpha_first
         ldrb            w16, [x3, #1]           // w16: r
@@ -292,7 +330,7 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
         smaddl          x8, w16, w10, x9        // x8 = ru * r + const_offset
         smaddl          x8, w17, w11, x8        // x8 += gu * g
         smaddl          x8, w4, w12, x8         // x8 += bu * b
-        asr             w8, w8, #9              // x8 >>= 9
+        asr             x8, x8, #9              // x8 >>= 9
         strh            w8, [x0], #2            // store to dst_u
 
         smaddl          x8, w16, w13, x9        // x8 = rv * r + const_offset
@@ -401,3 +439,4 @@ endfunc
 
 DISABLE_DOTPROD
 #endif
+
-- 
2.49.0


end of thread, other threads:[~2025-06-05 12:00 UTC | newest]

Thread overview: 9+ messages
2025-05-27 16:57 [FFmpeg-devel] [PATCH 1/2] swscale: rgb_to_yuv neon optimizations Dmitriy Kovalenko
2025-05-29 18:53 ` Martin Storsjö
2025-05-29 21:38   ` Dmitriy Kovalenko
2025-05-30  7:09     ` Martin Storsjö
2025-05-30  7:18       ` Dmitriy Kovalenko
2025-05-30  7:22         ` Martin Storsjö
2025-05-30  7:07   ` Martin Storsjö
     [not found] <20250531091631.45342-1-dmtr.kovalenko@outlook.com>
2025-05-31  9:11 ` Dmitriy Kovalenko
2025-06-05 12:00   ` Martin Storsjö
