Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] swscale/aarch64/hscale.S Refactor hscale_16_to_15__fs_4
@ 2025-03-01 12:59 Krzysztof Pyrkosz via ffmpeg-devel
  2025-03-01 23:03 ` Martin Storsjö
  0 siblings, 1 reply; 3+ messages in thread
From: Krzysztof Pyrkosz via ffmpeg-devel @ 2025-03-01 12:59 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Krzysztof Pyrkosz

Before/after (for each CPU, the first block is before and the second is after this patch):

A78
hscale_16_to_15__fs_4_dstW_8_neon:                      86.8 ( 1.72x)
hscale_16_to_15__fs_4_dstW_24_neon:                    147.5 ( 2.73x)
hscale_16_to_15__fs_4_dstW_128_neon:                   614.0 ( 3.14x)
hscale_16_to_15__fs_4_dstW_144_neon:                   680.5 ( 3.18x)
hscale_16_to_15__fs_4_dstW_256_neon:                  1193.2 ( 3.19x)
hscale_16_to_15__fs_4_dstW_512_neon:                  2305.0 ( 3.27x)

hscale_16_to_15__fs_4_dstW_8_neon:                      86.0 ( 1.74x)
hscale_16_to_15__fs_4_dstW_24_neon:                    106.8 ( 3.78x)
hscale_16_to_15__fs_4_dstW_128_neon:                   404.0 ( 4.81x)
hscale_16_to_15__fs_4_dstW_144_neon:                   451.8 ( 4.80x)
hscale_16_to_15__fs_4_dstW_256_neon:                   760.5 ( 5.06x)
hscale_16_to_15__fs_4_dstW_512_neon:                  1520.0 ( 5.01x)

A72
hscale_16_to_15__fs_4_dstW_8_neon:                     156.8 ( 1.52x)
hscale_16_to_15__fs_4_dstW_24_neon:                    217.8 ( 2.52x)
hscale_16_to_15__fs_4_dstW_128_neon:                   906.8 ( 2.90x)
hscale_16_to_15__fs_4_dstW_144_neon:                  1014.5 ( 2.91x)
hscale_16_to_15__fs_4_dstW_256_neon:                  1751.5 ( 2.96x)
hscale_16_to_15__fs_4_dstW_512_neon:                  3469.3 ( 2.97x)

hscale_16_to_15__fs_4_dstW_8_neon:                     151.2 ( 1.54x)
hscale_16_to_15__fs_4_dstW_24_neon:                    173.4 ( 3.15x)
hscale_16_to_15__fs_4_dstW_128_neon:                   660.0 ( 3.98x)
hscale_16_to_15__fs_4_dstW_144_neon:                   735.7 ( 4.00x)
hscale_16_to_15__fs_4_dstW_256_neon:                  1273.5 ( 4.09x)
hscale_16_to_15__fs_4_dstW_512_neon:                  2488.2 ( 4.16x)
---

This patch removes the use of the stack for temporary state and replaces
the interleaved ld4 loads with ld1.
I'm aware the component is being deprecated; however, in my use case
(screen recording) this function accounts for roughly 15% of the total
time, so the improvement is significant and worth sharing.

Krzysztof


 libswscale/aarch64/hscale.S | 183 ++++++++++++++----------------------
 1 file changed, 70 insertions(+), 113 deletions(-)

diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
index 435460c1af..4140fa9c60 100644
--- a/libswscale/aarch64/hscale.S
+++ b/libswscale/aarch64/hscale.S
@@ -638,6 +638,16 @@ function ff_hscale8to19_X4_neon, export=1
         ret
 endfunc
 
+
+.macro hscale_iter src, src2, filter, dst1, dst2
+        uxtl            \src\().4s, \src\().4h
+        sxtl            v19.4s, \filter\().4h
+        mul             \dst1\().4s, \src\().4s, v19.4s
+        uxtl            \src2\().4s, \src2\().4h
+        sxtl2           \filter\().4s, \filter\().8h
+        mul             \dst2\().4s, \src2\().4s, \filter\().4s
+.endm
+
 function ff_hscale16to15_4_neon_asm, export=1
         // w0               int shift
         // x1               int32_t *dst
@@ -664,6 +674,7 @@ function ff_hscale16to15_4_neon_asm, export=1
         add             x5, x5, #32
 
         // shift all filterPos left by one, as uint16_t will be read
+        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
         lsl             x8, x8, #1
         lsl             x9, x9, #1
         lsl             x10, x10, #1
@@ -674,154 +685,101 @@ function ff_hscale16to15_4_neon_asm, export=1
         lsl             x15, x15, #1
 
         // load src with given offset
-        ldr             x8,  [x3, w8,  uxtw]
-        ldr             x9,  [x3, w9,  uxtw]
-        ldr             x10, [x3, w10, uxtw]
-        ldr             x11, [x3, w11, uxtw]
-        ldr             x12, [x3, w12, uxtw]
-        ldr             x13, [x3, w13, uxtw]
-        ldr             x14, [x3, w14, uxtw]
-        ldr             x15, [x3, w15, uxtw]
-
-        sub             sp, sp, #64
-        // push src on stack so it can be loaded into vectors later
-        stp             x8, x9, [sp]
-        stp             x10, x11, [sp, #16]
-        stp             x12, x13, [sp, #32]
-        stp             x14, x15, [sp, #48]
+        ldr             d0, [x3, w8,  uxtw]
+        ldr             d1, [x3, w9,  uxtw]
+        ldr             d2, [x3, w10, uxtw]
+        ldr             d3, [x3, w11, uxtw]
+        ldr             d4, [x3, w12, uxtw]
+        ldr             d5, [x3, w13, uxtw]
+        ldr             d6, [x3, w14, uxtw]
+        ldr             d7, [x3, w15, uxtw]
 
 1:
-        ld4             {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
-        ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
-
-        // Each of blocks does the following:
-        // Extend src and filter to 32 bits with uxtl and sxtl
-        // multiply or multiply and accumulate results
-        // Extending to 32 bits is necessary, as unit16_t values can't
-        // be represented as int16_t without type promotion.
-        uxtl            v26.4s, v0.4h
-        sxtl            v27.4s, v28.4h
-        uxtl2           v0.4s, v0.8h
-        mul             v5.4s, v26.4s, v27.4s
-        sxtl2           v28.4s, v28.8h
-        uxtl            v26.4s, v1.4h
-        mul             v6.4s, v0.4s, v28.4s
-
-        sxtl            v27.4s, v29.4h
-        uxtl2           v0.4s, v1.8h
-        mla             v5.4s, v27.4s, v26.4s
-        sxtl2           v28.4s, v29.8h
-        uxtl            v26.4s, v2.4h
-        mla             v6.4s, v28.4s, v0.4s
-
-        sxtl            v27.4s, v30.4h
-        uxtl2           v0.4s, v2.8h
-        mla             v5.4s, v27.4s, v26.4s
-        sxtl2           v28.4s, v30.8h
-        uxtl            v26.4s, v3.4h
-        mla             v6.4s, v28.4s, v0.4s
-
-        sxtl            v27.4s, v31.4h
-        uxtl2           v0.4s, v3.8h
-        mla             v5.4s, v27.4s, v26.4s
-        sxtl2           v28.4s, v31.8h
-        sub             w2, w2, #8
-        mla             v6.4s, v28.4s, v0.4s
-
-        sshl            v5.4s, v5.4s, v17.4s
-        sshl            v6.4s, v6.4s, v17.4s
-        smin            v5.4s, v5.4s, v18.4s
-        smin            v6.4s, v6.4s, v18.4s
-        xtn             v5.4h, v5.4s
-        xtn2            v5.8h, v6.4s
-
-        st1             {v5.8h}, [x1], #16
-        cmp             w2, #16
 
         // load filterPositions into registers for next iteration
+
+        hscale_iter     v0, v1, v28, v20, v21
         ldp             w8, w9, [x5]                // filterPos[0], filterPos[1]
+        hscale_iter     v2, v3, v29, v22, v23
         ldp             w10, w11, [x5, #8]          // filterPos[2], filterPos[3]
+        hscale_iter     v4, v5, v30, v24, v25
         ldp             w12, w13, [x5, #16]         // filterPos[4], filterPos[5]
+        hscale_iter     v6, v7, v31, v26, v27
         ldp             w14, w15, [x5, #24]         // filterPos[6], filterPos[7]
+        subs            w2, w2, #8
         add             x5, x5, #32
 
+        ldp             q28, q29, [x4], #32 // filter[0..7]
         lsl             x8, x8, #1
         lsl             x9, x9, #1
         lsl             x10, x10, #1
         lsl             x11, x11, #1
+        ldp             q30, q31, [x4], #32 // filter[0..7]
         lsl             x12, x12, #1
         lsl             x13, x13, #1
         lsl             x14, x14, #1
         lsl             x15, x15, #1
 
-        ldr             x8,  [x3, w8,  uxtw]
-        ldr             x9,  [x3, w9,  uxtw]
-        ldr             x10, [x3, w10, uxtw]
-        ldr             x11, [x3, w11, uxtw]
-        ldr             x12, [x3, w12, uxtw]
-        ldr             x13, [x3, w13, uxtw]
-        ldr             x14, [x3, w14, uxtw]
-        ldr             x15, [x3, w15, uxtw]
+        addp            v20.4s, v20.4s, v21.4s
+        ldr             d0, [x3, w8,  uxtw]
+        addp            v22.4s, v22.4s, v23.4s
+        ldr             d1, [x3, w9,  uxtw]
+        addp            v24.4s, v24.4s, v25.4s
+        ldr             d2, [x3, w10, uxtw]
+        addp            v26.4s, v26.4s, v27.4s
+        ldr             d3, [x3, w11, uxtw]
+        addp            v20.4s, v20.4s, v22.4s
+        ldr             d4, [x3, w12, uxtw]
+        addp            v21.4s, v24.4s, v26.4s
+        ldr             d5, [x3, w13, uxtw]
+        cmp             w2, #16
 
-        stp             x8, x9, [sp]
-        stp             x10, x11, [sp, #16]
-        stp             x12, x13, [sp, #32]
-        stp             x14, x15, [sp, #48]
+        sshl            v20.4s, v20.4s, v17.4s
+        ldr             d6, [x3, w14, uxtw]
+        sshl            v21.4s, v21.4s, v17.4s
+        ldr             d7, [x3, w15, uxtw]
+        smin            v20.4s, v20.4s, v18.4s
+        smin            v21.4s, v21.4s, v18.4s
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v21.4s
+
+        st1             {v20.8h}, [x1], #16
 
         b.ge            1b
 
         // here we make last iteration, without updating the registers
-        ld4             {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
-        ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
-
-        uxtl            v26.4s, v0.4h
-        sxtl            v27.4s, v28.4h
-        uxtl2           v0.4s, v0.8h
-        mul             v5.4s, v26.4s, v27.4s
-        sxtl2           v28.4s, v28.8h
-        uxtl            v26.4s, v1.4h
-        mul             v6.4s, v0.4s, v28.4s
-
-        sxtl            v27.4s, v29.4h
-        uxtl2           v0.4s, v1.8h
-        mla             v5.4s, v26.4s, v27.4s
-        sxtl2           v28.4s, v29.8h
-        uxtl            v26.4s, v2.4h
-        mla             v6.4s, v0.4s, v28.4s
 
-        sxtl            v27.4s, v30.4h
-        uxtl2           v0.4s, v2.8h
-        mla             v5.4s, v26.4s, v27.4s
-        sxtl2           v28.4s, v30.8h
-        uxtl            v26.4s, v3.4h
-        mla             v6.4s, v0.4s, v28.4s
-
-        sxtl            v27.4s, v31.4h
-        uxtl2           v0.4s, v3.8h
-        mla             v5.4s, v26.4s, v27.4s
-        sxtl2           v28.4s, v31.8h
+        hscale_iter     v0, v1, v28, v20, v21
+        hscale_iter     v2, v3, v29, v22, v23
+        hscale_iter     v4, v5, v30, v24, v25
+        hscale_iter     v6, v7, v31, v26, v27
         subs            w2, w2, #8
-        mla             v6.4s, v0.4s, v28.4s
 
-        sshl            v5.4s, v5.4s, v17.4s
-        sshl            v6.4s, v6.4s, v17.4s
-        smin            v5.4s, v5.4s, v18.4s
-        smin            v6.4s, v6.4s, v18.4s
-        xtn             v5.4h, v5.4s
-        xtn2            v5.8h, v6.4s
+        addp            v20.4s, v20.4s, v21.4s
+        addp            v22.4s, v22.4s, v23.4s
+        addp            v24.4s, v24.4s, v25.4s
+        addp            v26.4s, v26.4s, v27.4s
+        addp            v0.4s, v20.4s, v22.4s
+        addp            v1.4s, v24.4s, v26.4s
 
-        st1             {v5.8h}, [x1], #16
-        add             sp, sp, #64                 // restore stack
+        sshl            v0.4s, v0.4s, v17.4s
+        sshl            v1.4s, v1.4s, v17.4s
+        smin            v0.4s, v0.4s, v18.4s
+        smin            v1.4s, v1.4s, v18.4s
+        xtn             v0.4h, v0.4s
+        xtn2            v0.8h, v1.4s
+
+        st1             {v0.8h}, [x1], #16
         cbnz            w2, 2f
 
         ret
 
 2:
         ldr             w8, [x5], #4                // load filterPos
-        lsl             w8, w8, #1
-        add             x9, x3, w8, uxtw            // src + filterPos
+        add             x9, x3, w8, uxtw #1         // src + filterPos
         ld1             {v0.4h}, [x9]               // load 4 * uint16_t
         ld1             {v31.4h}, [x4], #8
+        sub             w2, w2, #1
 
         uxtl            v0.4s, v0.4h
         sxtl            v31.4s, v31.4h
@@ -830,7 +788,6 @@ function ff_hscale16to15_4_neon_asm, export=1
         sshl            v0.4s, v0.4s, v17.4s
         smin            v0.4s, v0.4s, v18.4s
         st1             {v0.h}[0], [x1], #2
-        sub             w2, w2, #1
         cbnz            w2, 2b                      // if iterations remain jump to beginning
 
         ret
-- 
2.47.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/hscale.S Refactor hscale_16_to_15__fs_4
  2025-03-01 12:59 [FFmpeg-devel] [PATCH] swscale/aarch64/hscale.S Refactor hscale_16_to_15__fs_4 Krzysztof Pyrkosz via ffmpeg-devel
@ 2025-03-01 23:03 ` Martin Storsjö
  2025-03-01 23:21   ` Martin Storsjö
  0 siblings, 1 reply; 3+ messages in thread
From: Martin Storsjö @ 2025-03-01 23:03 UTC (permalink / raw)
  To: Krzysztof Pyrkosz via ffmpeg-devel; +Cc: Krzysztof Pyrkosz

On Sat, 1 Mar 2025, Krzysztof Pyrkosz via ffmpeg-devel wrote:

> Before/after:
>
> A78
> hscale_16_to_15__fs_4_dstW_8_neon:                      86.8 ( 1.72x)
> hscale_16_to_15__fs_4_dstW_24_neon:                    147.5 ( 2.73x)
> hscale_16_to_15__fs_4_dstW_128_neon:                   614.0 ( 3.14x)
> hscale_16_to_15__fs_4_dstW_144_neon:                   680.5 ( 3.18x)
> hscale_16_to_15__fs_4_dstW_256_neon:                  1193.2 ( 3.19x)
> hscale_16_to_15__fs_4_dstW_512_neon:                  2305.0 ( 3.27x)
>
> hscale_16_to_15__fs_4_dstW_8_neon:                      86.0 ( 1.74x)
> hscale_16_to_15__fs_4_dstW_24_neon:                    106.8 ( 3.78x)
> hscale_16_to_15__fs_4_dstW_128_neon:                   404.0 ( 4.81x)
> hscale_16_to_15__fs_4_dstW_144_neon:                   451.8 ( 4.80x)
> hscale_16_to_15__fs_4_dstW_256_neon:                   760.5 ( 5.06x)
> hscale_16_to_15__fs_4_dstW_512_neon:                  1520.0 ( 5.01x)
>
> A72
> hscale_16_to_15__fs_4_dstW_8_neon:                     156.8 ( 1.52x)
> hscale_16_to_15__fs_4_dstW_24_neon:                    217.8 ( 2.52x)
> hscale_16_to_15__fs_4_dstW_128_neon:                   906.8 ( 2.90x)
> hscale_16_to_15__fs_4_dstW_144_neon:                  1014.5 ( 2.91x)
> hscale_16_to_15__fs_4_dstW_256_neon:                  1751.5 ( 2.96x)
> hscale_16_to_15__fs_4_dstW_512_neon:                  3469.3 ( 2.97x)
>
> hscale_16_to_15__fs_4_dstW_8_neon:                     151.2 ( 1.54x)
> hscale_16_to_15__fs_4_dstW_24_neon:                    173.4 ( 3.15x)
> hscale_16_to_15__fs_4_dstW_128_neon:                   660.0 ( 3.98x)
> hscale_16_to_15__fs_4_dstW_144_neon:                   735.7 ( 4.00x)
> hscale_16_to_15__fs_4_dstW_256_neon:                  1273.5 ( 4.09x)
> hscale_16_to_15__fs_4_dstW_512_neon:                  2488.2 ( 4.16x)
> ---
>
> This patch removes the use of stack for temporary state and replaces
> interleaved ld4 loads with ld1.
> I'm aware the component is being deprecated, however in my use case
> (screen recording) the total time spent in this function is roughly 15%,
> the improvement is significant and worth sharing.

The patch looks good. I didn't follow it in exact detail, but overall it
looks reasonable, and much better than the previous form. This
description of what the patch does and why is also worth keeping in the
final commit message; as there's no need to repost the patch, I can
just adjust the message myself before pushing it.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/hscale.S Refactor hscale_16_to_15__fs_4
  2025-03-01 23:03 ` Martin Storsjö
@ 2025-03-01 23:21   ` Martin Storsjö
  0 siblings, 0 replies; 3+ messages in thread
From: Martin Storsjö @ 2025-03-01 23:21 UTC (permalink / raw)
  To: Krzysztof Pyrkosz via ffmpeg-devel; +Cc: Krzysztof Pyrkosz

On Sun, 2 Mar 2025, Martin Storsjö wrote:

> On Sat, 1 Mar 2025, Krzysztof Pyrkosz via ffmpeg-devel wrote:
>
>> Before/after:
>> 
>> A78
>> hscale_16_to_15__fs_4_dstW_8_neon:                      86.8 ( 1.72x)
>> hscale_16_to_15__fs_4_dstW_24_neon:                    147.5 ( 2.73x)
>> hscale_16_to_15__fs_4_dstW_128_neon:                   614.0 ( 3.14x)
>> hscale_16_to_15__fs_4_dstW_144_neon:                   680.5 ( 3.18x)
>> hscale_16_to_15__fs_4_dstW_256_neon:                  1193.2 ( 3.19x)
>> hscale_16_to_15__fs_4_dstW_512_neon:                  2305.0 ( 3.27x)
>> 
>> hscale_16_to_15__fs_4_dstW_8_neon:                      86.0 ( 1.74x)
>> hscale_16_to_15__fs_4_dstW_24_neon:                    106.8 ( 3.78x)
>> hscale_16_to_15__fs_4_dstW_128_neon:                   404.0 ( 4.81x)
>> hscale_16_to_15__fs_4_dstW_144_neon:                   451.8 ( 4.80x)
>> hscale_16_to_15__fs_4_dstW_256_neon:                   760.5 ( 5.06x)
>> hscale_16_to_15__fs_4_dstW_512_neon:                  1520.0 ( 5.01x)
>> 
>> A72
>> hscale_16_to_15__fs_4_dstW_8_neon:                     156.8 ( 1.52x)
>> hscale_16_to_15__fs_4_dstW_24_neon:                    217.8 ( 2.52x)
>> hscale_16_to_15__fs_4_dstW_128_neon:                   906.8 ( 2.90x)
>> hscale_16_to_15__fs_4_dstW_144_neon:                  1014.5 ( 2.91x)
>> hscale_16_to_15__fs_4_dstW_256_neon:                  1751.5 ( 2.96x)
>> hscale_16_to_15__fs_4_dstW_512_neon:                  3469.3 ( 2.97x)
>> 
>> hscale_16_to_15__fs_4_dstW_8_neon:                     151.2 ( 1.54x)
>> hscale_16_to_15__fs_4_dstW_24_neon:                    173.4 ( 3.15x)
>> hscale_16_to_15__fs_4_dstW_128_neon:                   660.0 ( 3.98x)
>> hscale_16_to_15__fs_4_dstW_144_neon:                   735.7 ( 4.00x)
>> hscale_16_to_15__fs_4_dstW_256_neon:                  1273.5 ( 4.09x)
>> hscale_16_to_15__fs_4_dstW_512_neon:                  2488.2 ( 4.16x)
>> ---
>> 
>> This patch removes the use of stack for temporary state and replaces
>> interleaved ld4 loads with ld1.
>> I'm aware the component is being deprecated, however in my use case
>> (screen recording) the total time spent in this function is roughly 15%,
>> the improvement is significant and worth sharing.
>
> The patch looks good. I didn't follow it in exact detail, but it overall 
> looks reasonable, and looks much better than the previous form. This 
> description of what the patch does and why also is worth keeping in the final 
> commit message, but as there's no need to repost the patch, I could just 
> adjust the message myself before pushing it.

I have now pushed this one and the second ac3dsp patch, with the commit
messages adjusted slightly. The first ac3dsp patch should be good
too, if someone verifies that it's ok to handle 16 elements at a time.

// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-03-01 23:21 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-03-01 12:59 [FFmpeg-devel] [PATCH] swscale/aarch64/hscale.S Refactor hscale_16_to_15__fs_4 Krzysztof Pyrkosz via ffmpeg-devel
2025-03-01 23:03 ` Martin Storsjö
2025-03-01 23:21   ` Martin Storsjö

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git