Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c()
@ 2025-06-02  4:36 Harshitha Sarangu Suresh
  2025-06-02  7:52 ` Martin Storsjö
  0 siblings, 1 reply; 5+ messages in thread
From: Harshitha Sarangu Suresh @ 2025-06-02  4:36 UTC (permalink / raw)
  To: Kieran Kunhya via ffmpeg-devel; +Cc: Dash Santosh Sathyanarayanan

[-- Attachment #1: Type: text/plain, Size: 15163 bytes --]

From 7260822a578130a713c1455cca6cdd06f1540db8 Mon Sep 17 00:00:00 2001
From: Harshitha Suresh <harshitha@multicorewareinc.com>
Date: Mon, 19 May 2025 22:37:20 +0530
Subject: [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c()

yuv2nv12cX_2_512_accurate_c:                          3508.8 ( 1.00x)
yuv2nv12cX_2_512_accurate_neon:                        369.2 ( 9.50x)
yuv2nv12cX_2_512_approximate_c:                       3499.0 ( 1.00x)
yuv2nv12cX_2_512_approximate_neon:                     370.2 ( 9.45x)
yuv2nv12cX_4_512_accurate_c:                          4683.0 ( 1.00x)
yuv2nv12cX_4_512_accurate_neon:                        568.8 ( 8.23x)
yuv2nv12cX_4_512_approximate_c:                       4682.6 ( 1.00x)
yuv2nv12cX_4_512_approximate_neon:                     569.9 ( 8.22x)
yuv2nv12cX_8_512_accurate_c:                          7243.0 ( 1.00x)
yuv2nv12cX_8_512_accurate_neon:                        937.6 ( 7.72x)
yuv2nv12cX_8_512_approximate_c:                       7235.9 ( 1.00x)
yuv2nv12cX_8_512_approximate_neon:                     938.3 ( 7.71x)
yuv2nv12cX_16_512_accurate_c:                        13749.7 ( 1.00x)
yuv2nv12cX_16_512_accurate_neon:                      1708.1 ( 8.05x)
yuv2nv12cX_16_512_approximate_c:                     13750.0 ( 1.00x)
yuv2nv12cX_16_512_approximate_neon:                   1708.6 ( 8.05x)
---
 libswscale/aarch64/output.S  | 308 +++++++++++++++++++++++++++++++++++
 libswscale/aarch64/swscale.c |  18 ++
 2 files changed, 326 insertions(+)

diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 190c438870..8eb89e8b54 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -226,3 +226,311 @@ function ff_yuv2plane1_8_neon, export=1
         b.gt            2b                              // loop until width consumed
         ret
 endfunc
+
+// Vertical chroma filter producing interleaved U,V byte pairs. C prototype:
+// void ff_yuv2nv12cX_notswapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+//         const int16_t *chrFilter, int chrFilterSize, const int16_t **chrUSrc,
+//         const int16_t **chrVSrc, uint8_t *dest, int chrDstW)
+
+function ff_yuv2nv12cX_notswapped_neon, export=1
+    // x0 - dstFormat (unused)          x1 - uint8_t *chrDither (8 bytes are read)
+    // x2 - int16_t *chrFilter          x3 - int chrFilterSize
+    // x4 - int16_t **chrUSrc           x5 - int16_t **chrVSrc
+    // x6 - uint8_t *dest               x7 - int chrDstW
+    // Scratch: x8-x16, v0-v6, v16-v31 and the flags; nothing is saved or restored.
+    // Per pixel i: u = (chrDither[i & 7] << 12) + sum_j chrUSrc[j][i] * chrFilter[j],
+    // v likewise with chrDither[(i + 3) & 7] and chrVSrc[j][i]; both are shifted right
+    // by 19 with saturation to 0..255, then dest[2*i] = u and dest[2*i + 1] = v.
+
+    // Load dither pattern and compute U and V dither vectors
+    ld1     {v0.8b}, [x1]               // chrDither[0..7]
+    ext     v1.8b, v0.8b, v0.8b, #3      // Rotate for V: lane k = chrDither[(k+3)&7]
+
+    uxtl    v0.8h, v0.8b                // widen U dither bytes to 16 bit
+    uxtl    v1.8h, v1.8b                // widen V dither bytes to 16 bit
+
+    ushll    v2.4s, v0.4h, #12         // U dither low  (dither << 12, lanes 0-3)
+    ushll2   v3.4s, v0.8h, #12         // U dither high (dither << 12, lanes 4-7)
+    ushll    v4.4s, v1.4h, #12         // V dither low
+    ushll2   v5.4s, v1.8h, #12         // V dither high
+
+    // The 16-pixel loop is only used when the whole row is a multiple of 16
+    tst     w7, #15                    // Check if chrDstW % 16 == 0
+    b.ne    .Lprocess_8_pixels          // If not, use the 8-pixel loop for the whole row
+
+    // =============================================
+    // 16-pixel processing path
+    // =============================================
+    mov     x8, #0                     // i = 0
+.Lloop_16_pixels:
+
+    mov     v16.16b, v2.16b            // U acc low,  pixels i..i+3   (starts at dither)
+    mov     v17.16b, v3.16b            // U acc high, pixels i+4..i+7
+    mov     v18.16b, v4.16b            // V acc low,  pixels i..i+3
+    mov     v19.16b, v5.16b            // V acc high, pixels i+4..i+7
+
+    mov     v20.16b, v2.16b            // U acc low,  pixels i+8..i+11 (dither repeats every 8)
+    mov     v21.16b, v3.16b            // U acc high, pixels i+12..i+15
+    mov     v22.16b, v4.16b            // V acc low,  pixels i+8..i+11
+    mov     v23.16b, v5.16b            // V acc high, pixels i+12..i+15
+
+    mov     w9, w3                      // chrFilterSize counter
+    mov     x10, x2                     // chrFilter pointer
+    mov     x11, x4                     // chrUSrc base
+    mov     x12, x5                     // chrVSrc base
+
+.Lfilter_loop_16:
+    ldr     h6, [x10], #2               // h6 = chrFilter[j], advance to next coefficient
+
+    ldr     x13, [x11], #8              // chrUSrc[j]
+    ldr     x14, [x12], #8              // chrVSrc[j]
+    add     x13, x13, x8, lsl #1        // &chrUSrc[j][i]
+    add     x14, x14, x8, lsl #1        // &chrVSrc[j][i]
+    add     x15, x13, #16               // x15 = &chrUSrc[j][i+8] (8 samples * 2 bytes)
+    add     x16, x14, #16               // x16 = &chrVSrc[j][i+8]
+
+    ld1     {v24.8h}, [x13]             // U samples 0-7
+    ld1     {v25.8h}, [x14]             // V samples 0-7
+
+    ld1     {v26.8h}, [x15]             // U samples 8-15
+    ld1     {v27.8h}, [x16]             // V samples 8-15
+
+    smlal   v16.4s, v24.4h, v6.h[0]     // U acc += sample * coefficient (signed, widening)
+    smlal2  v17.4s, v24.8h, v6.h[0]
+    smlal   v18.4s, v25.4h, v6.h[0]     // V acc += sample * coefficient
+    smlal2  v19.4s, v25.8h, v6.h[0]
+
+    smlal   v20.4s, v26.4h, v6.h[0]     // same for pixels 8-15
+    smlal2  v21.4s, v26.8h, v6.h[0]
+    smlal   v22.4s, v27.4h, v6.h[0]
+    smlal2  v23.4s, v27.8h, v6.h[0]
+
+    subs    w9, w9, #1                  // one filter tap consumed
+    b.gt    .Lfilter_loop_16            // loop while taps remain (body runs at least once)
+
+    // Narrow pixels 0-7: >> 16 then >> 3 (>> 19 in total), saturating to 0..255
+    sqshrun  v28.4h, v16.4s, #16       // signed acc >> 16, saturate to unsigned 16 bit
+    sqshrun2 v28.8h, v17.4s, #16
+    sqshrun  v29.4h, v18.4s, #16
+    sqshrun2 v29.8h, v19.4s, #16
+    uqshrn   v30.8b, v28.8h, #3        // U
+    uqshrn   v31.8b, v29.8h, #3        // V
+
+    // Narrow pixels 8-15 the same way (v24/v25 are free again after the filter loop)
+    sqshrun  v28.4h, v20.4s, #16
+    sqshrun2 v28.8h, v21.4s, #16
+    sqshrun  v29.4h, v22.4s, #16
+    sqshrun2 v29.8h, v23.4s, #16
+    uqshrn   v24.8b, v28.8h, #3        // U
+    uqshrn   v25.8b, v29.8h, #3        // V
+
+    // Store both 8-pixel blocks as interleaved U,V byte pairs
+    st2     {v30.8b, v31.8b}, [x6], #16 // pixels 0-7, dest += 16
+    st2     {v24.8b, v25.8b}, [x6], #16 // pixels 8-15, dest += 16
+
+    subs    w7, w7, #16                 // remaining width -= 16 (sets the flags for b.gt)
+    add     x8, x8, #16                 // i += 16 (plain add, flags untouched)
+    b.gt    .Lloop_16_pixels
+    ret
+
+    // =============================================
+    // 8-pixel processing path (chrDstW % 16 != 0)
+    // =============================================
+.Lprocess_8_pixels:
+    mov     x8, #0                     // i = 0
+.Lloop_8_pixels:
+    // Initialize accumulators with dither
+    mov     v16.16b, v2.16b            // U acc low
+    mov     v17.16b, v3.16b            // U acc high
+    mov     v18.16b, v4.16b            // V acc low
+    mov     v19.16b, v5.16b            // V acc high
+
+    mov     w9, w3                      // chrFilterSize counter
+    mov     x10, x2                     // chrFilter pointer
+    mov     x11, x4                     // chrUSrc base
+    mov     x12, x5                     // chrVSrc base
+
+.Lfilter_loop_8:
+    ldr     h6, [x10], #2               // h6 = chrFilter[j], advance to next coefficient
+
+    ldr     x13, [x11], #8              // chrUSrc[j]
+    ldr     x14, [x12], #8              // chrVSrc[j]
+    add     x13, x13, x8, lsl #1        // &chrUSrc[j][i]
+    add     x14, x14, x8, lsl #1        // &chrVSrc[j][i]
+
+    ld1     {v20.8h}, [x13]             // U samples
+    ld1     {v21.8h}, [x14]             // V samples
+
+    smlal   v16.4s, v20.4h, v6.h[0]     // U acc += sample * coefficient (signed, widening)
+    smlal2  v17.4s, v20.8h, v6.h[0]
+    smlal   v18.4s, v21.4h, v6.h[0]     // V acc += sample * coefficient
+    smlal2  v19.4s, v21.8h, v6.h[0]
+
+    subs    w9, w9, #1                  // one filter tap consumed
+    b.gt    .Lfilter_loop_8             // loop while taps remain (body runs at least once)
+
+    // Narrow to bytes (>> 19 in total, saturating to 0..255) and store
+    sqshrun  v26.4h, v16.4s, #16
+    sqshrun2 v26.8h, v17.4s, #16
+    sqshrun  v27.4h, v18.4s, #16
+    sqshrun2 v27.8h, v19.4s, #16
+    uqshrn   v28.8b, v26.8h, #3        // U
+    uqshrn   v29.8b, v27.8h, #3        // V
+
+    st2     {v28.8b, v29.8b}, [x6], #16 // 8 interleaved U,V pairs, dest += 16
+
+    subs    w7, w7, #8                  // NOTE(review): a width not divisible by 8 still gets a full block - confirm the overrun is allowed
+    add     x8, x8, #8                  // i += 8 (plain add, flags untouched)
+    b.gt    .Lloop_8_pixels
+    ret
+endfunc
+
+function ff_yuv2nv12cX_swapped_neon, export=1
+    // Vertical chroma filter for swapped-chroma output: dest[2*i] = v, dest[2*i + 1] = u.
+    // x0 - dstFormat (unused)          x1 - uint8_t *chrDither (8 bytes are read)
+    // x2 - int16_t *chrFilter          x3 - int chrFilterSize
+    // x4 - int16_t **chrUSrc           x5 - int16_t **chrVSrc
+    // x6 - uint8_t *dest               x7 - int chrDstW
+    // Scratch: x8-x16, v0-v6, v16-v31 and the flags; nothing is saved or restored.
+    // Per pixel i: u = (chrDither[i & 7] << 12) + sum_j chrUSrc[j][i] * chrFilter[j]
+    // (v uses chrDither[(i + 3) & 7] and chrVSrc); output byte = sat_u8(value >> 19).
+
+    ld1     {v0.8b}, [x1]               // chrDither[0..7]
+    ext     v1.8b, v0.8b, v0.8b, #3      // Rotate for V: lane k = chrDither[(k+3)&7]
+
+    uxtl    v0.8h, v0.8b                // widen U dither bytes to 16 bit
+    uxtl    v1.8h, v1.8b                // widen V dither bytes to 16 bit
+
+    ushll    v2.4s, v0.4h, #12         // U dither low  (dither << 12, lanes 0-3)
+    ushll2   v3.4s, v0.8h, #12         // U dither high (dither << 12, lanes 4-7)
+    ushll    v4.4s, v1.4h, #12         // V dither low
+    ushll2   v5.4s, v1.8h, #12         // V dither high
+
+    // The 16-pixel loop is only used when the whole row is a multiple of 16
+    tst     w7, #15                    // Check if chrDstW % 16 == 0
+    b.ne    .Lswapped_process_8_pixels // If not, use the 8-pixel loop for the whole row
+
+    // =============================================
+    // 16-pixel processing path
+    // =============================================
+    mov     x8, #0                     // i = 0
+.Lswapped_loop_16_pixels:
+
+    mov     v16.16b, v2.16b            // U acc low,  pixels i..i+3   (starts at dither)
+    mov     v17.16b, v3.16b            // U acc high, pixels i+4..i+7
+    mov     v18.16b, v4.16b            // V acc low,  pixels i..i+3
+    mov     v19.16b, v5.16b            // V acc high, pixels i+4..i+7
+
+    mov     v20.16b, v2.16b            // U acc low,  pixels i+8..i+11 (dither repeats every 8)
+    mov     v21.16b, v3.16b            // U acc high, pixels i+12..i+15
+    mov     v22.16b, v4.16b            // V acc low,  pixels i+8..i+11
+    mov     v23.16b, v5.16b            // V acc high, pixels i+12..i+15
+
+    mov     w9, w3                      // chrFilterSize counter
+    mov     x10, x2                     // chrFilter pointer
+    mov     x11, x4                     // chrUSrc base
+    mov     x12, x5                     // chrVSrc base
+
+.Lswapped_filter_loop_16:
+    ldr     h6, [x10], #2               // h6 = chrFilter[j], advance to next coefficient
+
+    // Row pointers for tap j, offset to pixel i (and to pixel i+8)
+    ldr     x13, [x11], #8              // chrUSrc[j]
+    ldr     x14, [x12], #8              // chrVSrc[j]
+    add     x13, x13, x8, lsl #1        // &chrUSrc[j][i]
+    add     x14, x14, x8, lsl #1        // &chrVSrc[j][i]
+    add     x15, x13, #16               // x15 = &chrUSrc[j][i+8] (8 samples * 2 bytes)
+    add     x16, x14, #16               // x16 = &chrVSrc[j][i+8]
+
+    ld1     {v24.8h}, [x13]             // U samples 0-7
+    ld1     {v25.8h}, [x14]             // V samples 0-7
+
+    ld1     {v26.8h}, [x15]             // U samples 8-15
+    ld1     {v27.8h}, [x16]             // V samples 8-15
+
+    smlal   v16.4s, v24.4h, v6.h[0]     // U acc += sample * coefficient (signed, widening)
+    smlal2  v17.4s, v24.8h, v6.h[0]
+    smlal   v18.4s, v25.4h, v6.h[0]     // V acc += sample * coefficient
+    smlal2  v19.4s, v25.8h, v6.h[0]
+
+    smlal   v20.4s, v26.4h, v6.h[0]     // same for pixels 8-15
+    smlal2  v21.4s, v26.8h, v6.h[0]
+    smlal   v22.4s, v27.4h, v6.h[0]
+    smlal2  v23.4s, v27.8h, v6.h[0]
+
+    subs    w9, w9, #1                  // one filter tap consumed
+    b.gt    .Lswapped_filter_loop_16    // loop while taps remain (body runs at least once)
+
+    sqshrun  v28.4h, v16.4s, #16       // U: signed acc >> 16, saturate to unsigned 16 bit
+    sqshrun2 v28.8h, v17.4s, #16
+    sqshrun  v29.4h, v18.4s, #16       // V: likewise
+    sqshrun2 v29.8h, v19.4s, #16
+    uqshrn   v30.8b, v29.8h, #3        // V goes first: st2 writes its first register to even bytes
+    uqshrn   v31.8b, v28.8h, #3        // U goes second (odd bytes)
+
+    sqshrun  v28.4h, v20.4s, #16       // pixels 8-15, same narrowing
+    sqshrun2 v28.8h, v21.4s, #16
+    sqshrun  v29.4h, v22.4s, #16
+    sqshrun2 v29.8h, v23.4s, #16
+    uqshrn   v24.8b, v29.8h, #3        // V
+    uqshrn   v25.8b, v28.8h, #3        // U
+
+    // Store both 8-pixel blocks as interleaved V,U byte pairs
+    st2     {v30.8b, v31.8b}, [x6], #16 // pixels 0-7, dest += 16
+    st2     {v24.8b, v25.8b}, [x6], #16 // pixels 8-15, dest += 16
+
+    subs    w7, w7, #16                 // remaining width -= 16 (sets the flags for b.gt)
+    add     x8, x8, #16                 // i += 16 (plain add, flags untouched)
+    b.gt    .Lswapped_loop_16_pixels
+    ret
+
+    // =============================================
+    // 8-pixel processing path (chrDstW % 16 != 0)
+    // =============================================
+.Lswapped_process_8_pixels:
+    mov     x8, #0                     // i = 0
+.Lswapped_loop_8_pixels:
+    // Initialize accumulators with dither
+    mov     v16.16b, v2.16b            // U acc low
+    mov     v17.16b, v3.16b            // U acc high
+    mov     v18.16b, v4.16b            // V acc low
+    mov     v19.16b, v5.16b            // V acc high
+
+    mov     w9, w3                      // chrFilterSize counter
+    mov     x10, x2                     // chrFilter pointer
+    mov     x11, x4                     // chrUSrc base
+    mov     x12, x5                     // chrVSrc base
+
+.Lswapped_filter_loop_8:
+    ldr     h6, [x10], #2               // h6 = chrFilter[j], advance to next coefficient
+
+    ldr     x13, [x11], #8              // chrUSrc[j]
+    ldr     x14, [x12], #8              // chrVSrc[j]
+    add     x13, x13, x8, lsl #1        // &chrUSrc[j][i]
+    add     x14, x14, x8, lsl #1        // &chrVSrc[j][i]
+
+    ld1     {v20.8h}, [x13]             // U samples
+    ld1     {v21.8h}, [x14]             // V samples
+
+    smlal   v16.4s, v20.4h, v6.h[0]     // U acc += sample * coefficient (signed, widening)
+    smlal2  v17.4s, v20.8h, v6.h[0]
+    smlal   v18.4s, v21.4h, v6.h[0]     // V acc += sample * coefficient
+    smlal2  v19.4s, v21.8h, v6.h[0]
+
+    subs    w9, w9, #1                  // one filter tap consumed
+    b.gt    .Lswapped_filter_loop_8     // loop while taps remain (body runs at least once)
+
+    sqshrun  v26.4h, v16.4s, #16       // U: signed acc >> 16, saturate to unsigned 16 bit
+    sqshrun2 v26.8h, v17.4s, #16
+    sqshrun  v27.4h, v18.4s, #16       // V: likewise
+    sqshrun2 v27.8h, v19.4s, #16
+    uqshrn   v28.8b, v27.8h, #3        // V goes first (even bytes)
+    uqshrn   v29.8b, v26.8h, #3        // U goes second (odd bytes)
+
+    st2     {v28.8b, v29.8b}, [x6], #16 // 8 interleaved V,U pairs, dest += 16
+
+    subs    w7, w7, #8                  // NOTE(review): a width not divisible by 8 still gets a full block - confirm the overrun is allowed
+    add     x8, x8, #8                  // i += 8 (plain add, flags untouched)
+    b.gt    .Lswapped_loop_8_pixels
+    ret
+endfunc
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..0e57112f42 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -168,6 +168,16 @@ void ff_yuv2plane1_8_neon(
         const uint8_t *dither,
         int offset);

+void ff_yuv2nv12cX_notswapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+    const int16_t *chrFilter, int chrFilterSize,
+    const int16_t **chrUSrc, const int16_t **chrVSrc,
+    uint8_t *dest, int chrDstW);
+
+void ff_yuv2nv12cX_swapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+    const int16_t *chrFilter, int chrFilterSize,
+    const int16_t **chrUSrc, const int16_t **chrVSrc,
+    uint8_t *dest, int chrDstW);
+
 #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do {              \
     if (c->srcBpc == 8) {                                               \
         if(c->dstBpc <= 14) {                                           \
@@ -201,6 +211,12 @@ void ff_yuv2plane1_8_neon(
     default: break;                                                     \
     }

+#define ASSIGN_YUV2NV12_FUNC(yuv2nv12fn, opt, dstFormat)               \
+    if(!isSwappedChroma(dstFormat))                                    \
+        yuv2nv12fn = ff_yuv2nv12cX_notswapped_  ## opt;                \
+    else                                                               \
+        yuv2nv12fn = ff_yuv2nv12cX_swapped_  ## opt;
+
 #define NEON_INPUT(name) \
 void ff_##name##ToY_neon(uint8_t *dst, const uint8_t *src, const uint8_t *, \
                         const uint8_t *, int w, uint32_t *coeffs, void *); \
@@ -275,7 +291,9 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
+            ASSIGN_YUV2NV12_FUNC(c->yuv2nv12cX, neon, c->opts.dst_format);
         }
+
         switch (c->opts.src_format) {
         case AV_PIX_FMT_ABGR:
             c->lumToYV12 = ff_abgr32ToY_neon;
--
2.36.0.windows.1




[-- Attachment #2: swscale-aarch64-output-Implement-neon-assembly-fo.patch --]
[-- Type: application/octet-stream, Size: 14784 bytes --]

[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c()
  2025-06-02  4:36 [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c() Harshitha Sarangu Suresh
@ 2025-06-02  7:52 ` Martin Storsjö
  0 siblings, 0 replies; 5+ messages in thread
From: Martin Storsjö @ 2025-06-02  7:52 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Dash Santosh Sathyanarayanan

On Mon, 2 Jun 2025, Harshitha Sarangu Suresh wrote:

> From 7260822a578130a713c1455cca6cdd06f1540db8 Mon Sep 17 00:00:00 2001
> From: Harshitha Suresh <harshitha@multicorewareinc.com>
> Date: Mon, 19 May 2025 22:37:20 +0530
> Subject: [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c()
>
> yuv2nv12cX_2_512_accurate_c:                          3508.8 ( 1.00x)
> yuv2nv12cX_2_512_accurate_neon:                        369.2 ( 9.50x)
> yuv2nv12cX_2_512_approximate_c:                       3499.0 ( 1.00x)
> yuv2nv12cX_2_512_approximate_neon:                     370.2 ( 9.45x)
> yuv2nv12cX_4_512_accurate_c:                          4683.0 ( 1.00x)
> yuv2nv12cX_4_512_accurate_neon:                        568.8 ( 8.23x)
> yuv2nv12cX_4_512_approximate_c:                       4682.6 ( 1.00x)
> yuv2nv12cX_4_512_approximate_neon:                     569.9 ( 8.22x)
> yuv2nv12cX_8_512_accurate_c:                          7243.0 ( 1.00x)
> yuv2nv12cX_8_512_accurate_neon:                        937.6 ( 7.72x)
> yuv2nv12cX_8_512_approximate_c:                       7235.9 ( 1.00x)
> yuv2nv12cX_8_512_approximate_neon:                     938.3 ( 7.71x)
> yuv2nv12cX_16_512_accurate_c:                        13749.7 ( 1.00x)
> yuv2nv12cX_16_512_accurate_neon:                      1708.1 ( 8.05x)
> yuv2nv12cX_16_512_approximate_c:                     13750.0 ( 1.00x)
> yuv2nv12cX_16_512_approximate_neon:                   1708.6 ( 8.05x)
> ---
> libswscale/aarch64/output.S  | 308 +++++++++++++++++++++++++++++++++++
> libswscale/aarch64/swscale.c |  18 ++
> 2 files changed, 326 insertions(+)
>
> diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
> index 190c438870..8eb89e8b54 100644
> --- a/libswscale/aarch64/output.S
> +++ b/libswscale/aarch64/output.S
> @@ -226,3 +226,311 @@ function ff_yuv2plane1_8_neon, export=1
>         b.gt            2b                              // loop until width consumed
>         ret
> endfunc
> +
> +// void ff_yuv2nv12cX_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
> +//                         const int16_t *chrFilter, int chrFilterSize,
> +//                         const int16_t **chrUSrc, const int16_t **chrVSrc,
> +//                         uint8_t *dest, int chrDstW)
> +
> +function ff_yuv2nv12cX_notswapped_neon, export=1
> +    // x0 - dstFormat (unused)
> +    // x1 - uint8_t *chrDither
> +    // x2 - int16_t *chrFilter
> +    // x3 - int chrFilterSize
> +    // x4 - int16_t **chrUSrc
> +    // x5 - int16_t **chrVSrc
> +    // x6 - uint8_t *dest
> +    // x7 - int chrDstW
> +
> +    // Load dither pattern and compute U and V dither vectors
> +    ld1     {v0.8b}, [x1]               // chrDither[0..7]
> +    ext     v1.8b, v0.8b, v0.8b, #3      // Rotate for V: (i+3)&7

Please adhere to the indentation of the existing code, don't make up your 
own.

This patchset causes a lot of fate tests to fail, and causes ffmpeg to 
crash in a number of tests while running fate. Please fix that.

In order to test the assembly in all relevant configurations before 
submitting, I have made a setup of github actions, which anybody can use. 
This also includes indentation checks for the assembly.

If you make a copy of my branch at 
https://github.com/mstorsjo/ffmpeg/commits/gha-aarch64, add your own 
changes on top, and push to a repo on github, it will run the tests in all 
the configurations.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c()
  2025-06-06  7:07 ` Martin Storsjö
@ 2025-06-06  9:08   ` Harshitha Sarangu Suresh
  0 siblings, 0 replies; 5+ messages in thread
From: Harshitha Sarangu Suresh @ 2025-06-06  9:08 UTC (permalink / raw)
  To: FFmpeg development discussions and patches
  Cc: Dash Santosh Sathyanarayanan, Logaprakash Ramajayam

[-- Attachment #1: Type: text/plain, Size: 19512 bytes --]

Apologies, I sent an older version of the patch. Here is the correct version, which passes all the tests.

From 4ca5eae1e7164f78296719f19aef97239e5b046a Mon Sep 17 00:00:00 2001
From: Harshitha Suresh <harshitha@multicorewareinc.com>
Date: Mon, 19 May 2025 22:37:20 +0530
Subject: [PATCH v2] swscale/aarch64/output: Implement neon assembly for
 yuv2nv12cX_c()

yuv2nv12cX_2_512_accurate_c:                          3508.8 ( 1.00x)
yuv2nv12cX_2_512_accurate_neon:                        369.2 ( 9.50x)
yuv2nv12cX_2_512_approximate_c:                       3499.0 ( 1.00x)
yuv2nv12cX_2_512_approximate_neon:                     370.2 ( 9.45x)
yuv2nv12cX_4_512_accurate_c:                          4683.0 ( 1.00x)
yuv2nv12cX_4_512_accurate_neon:                        568.8 ( 8.23x)
yuv2nv12cX_4_512_approximate_c:                       4682.6 ( 1.00x)
yuv2nv12cX_4_512_approximate_neon:                     569.9 ( 8.22x)
yuv2nv12cX_8_512_accurate_c:                          7243.0 ( 1.00x)
yuv2nv12cX_8_512_accurate_neon:                        937.6 ( 7.72x)
yuv2nv12cX_8_512_approximate_c:                       7235.9 ( 1.00x)
yuv2nv12cX_8_512_approximate_neon:                     938.3 ( 7.71x)
yuv2nv12cX_16_512_accurate_c:                        13749.7 ( 1.00x)
yuv2nv12cX_16_512_accurate_neon:                      1708.1 ( 8.05x)
yuv2nv12cX_16_512_approximate_c:                     13750.0 ( 1.00x)
yuv2nv12cX_16_512_approximate_neon:                   1708.6 ( 8.05x)
---
 libswscale/aarch64/output.S  | 306 +++++++++++++++++++++++++++++++++++
 libswscale/aarch64/swscale.c |  19 +++
 2 files changed, 325 insertions(+)

diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 190c438870..2d87cc6a5e 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -226,3 +226,309 @@ function ff_yuv2plane1_8_neon, export=1
         b.gt            2b                              // loop until width consumed
         ret
 endfunc
+
+function ff_yuv2nv12cX_notswapped_neon, export=1
+// Vertical chroma filter; writes interleaved U,V byte pairs to dest.
+// x0 - dstFormat (unused)          x1 - uint8_t *chrDither (8 bytes are read)
+// x2 - int16_t *chrFilter          x3 - int chrFilterSize
+// x4 - int16_t **chrUSrc           x5 - int16_t **chrVSrc
+// x6 - uint8_t *dest               x7 - int chrDstW
+// Scratch: x8-x16, v0-v6, v16-v31 and the flags; nothing is saved or restored.
+// Per pixel i: u = (chrDither[i & 7] << 12) + sum_j chrUSrc[j][i] * chrFilter[j]
+// (v uses chrDither[(i + 3) & 7] and chrVSrc); output byte = sat_u8(value >> 19).
+
+        // Load dither pattern and compute U and V dither vectors
+        ld1             {v0.8b}, [x1]                   // chrDither[0..7]
+        ext             v1.8b, v0.8b, v0.8b, #3         // Rotate for V: lane k = chrDither[(k+3)&7]
+
+        uxtl            v0.8h, v0.8b                    // widen U dither bytes to 16 bit
+        uxtl            v1.8h, v1.8b                    // widen V dither bytes to 16 bit
+
+        ushll           v2.4s, v0.4h, #12               // U dither low  (dither << 12, lanes 0-3)
+        ushll2          v3.4s, v0.8h, #12               // U dither high (dither << 12, lanes 4-7)
+        ushll           v4.4s, v1.4h, #12               // V dither low
+        ushll2          v5.4s, v1.8h, #12               // V dither high
+
+        // The 16-pixel loop is only used when the whole row is a multiple of 16
+        tst             w7, #15                         // Check if chrDstW % 16 == 0
+        b.ne            .Lprocess_8_pixels              // If not, use the 8-pixel loop for the whole row
+
+        // =============================================
+        // 16-pixel processing path
+        // =============================================
+        mov             x8, #0                          // i = 0
+.Lloop_16_pixels:
+
+        mov             v16.16b, v2.16b                 // U acc low,  pixels i..i+3   (starts at dither)
+        mov             v17.16b, v3.16b                 // U acc high, pixels i+4..i+7
+        mov             v18.16b, v4.16b                 // V acc low,  pixels i..i+3
+        mov             v19.16b, v5.16b                 // V acc high, pixels i+4..i+7
+
+        mov             v20.16b, v2.16b                 // U acc low,  pixels i+8..i+11 (dither repeats every 8)
+        mov             v21.16b, v3.16b                 // U acc high, pixels i+12..i+15
+        mov             v22.16b, v4.16b                 // V acc low,  pixels i+8..i+11
+        mov             v23.16b, v5.16b                 // V acc high, pixels i+12..i+15
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+.Lfilter_loop_16:
+        ldr             h6, [x10], #2                   // h6 = chrFilter[j], advance to next coefficient
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+        add             x15, x13, #16                   // x15 = &chrUSrc[j][i+8] (8 samples * 2 bytes)
+        add             x16, x14, #16                   // x16 = &chrVSrc[j][i+8]
+
+        ld1             {v24.8h}, [x13]                 // U samples 0-7
+        ld1             {v25.8h}, [x14]                 // V samples 0-7
+
+        ld1             {v26.8h}, [x15]                 // U samples 8-15
+        ld1             {v27.8h}, [x16]                 // V samples 8-15
+
+        smlal           v16.4s, v24.4h, v6.h[0]         // U acc += sample * coefficient (signed, widening)
+        smlal2          v17.4s, v24.8h, v6.h[0]
+        smlal           v18.4s, v25.4h, v6.h[0]         // V acc += sample * coefficient
+        smlal2          v19.4s, v25.8h, v6.h[0]
+
+        smlal           v20.4s, v26.4h, v6.h[0]         // same for pixels 8-15
+        smlal2          v21.4s, v26.8h, v6.h[0]
+        smlal           v22.4s, v27.4h, v6.h[0]
+        smlal2          v23.4s, v27.8h, v6.h[0]
+
+        subs            w9, w9, #1                      // one filter tap consumed
+        b.gt            .Lfilter_loop_16                // loop while taps remain (body runs at least once)
+
+        // Narrow pixels 0-7: >> 16 then >> 3 (>> 19 in total), saturating to 0..255
+        sqshrun         v28.4h, v16.4s, #16             // signed acc >> 16, saturate to unsigned 16 bit
+        sqshrun2        v28.8h, v17.4s, #16
+        sqshrun         v29.4h, v18.4s, #16
+        sqshrun2        v29.8h, v19.4s, #16
+        uqshrn          v30.8b, v28.8h, #3              // U
+        uqshrn          v31.8b, v29.8h, #3              // V
+
+        // Narrow pixels 8-15 the same way (v24/v25 are free again after the filter loop)
+        sqshrun         v28.4h, v20.4s, #16
+        sqshrun2        v28.8h, v21.4s, #16
+        sqshrun         v29.4h, v22.4s, #16
+        sqshrun2        v29.8h, v23.4s, #16
+        uqshrn          v24.8b, v28.8h, #3              // U
+        uqshrn          v25.8b, v29.8h, #3              // V
+
+        // Store both 8-pixel blocks as interleaved U,V byte pairs
+        st2             {v30.8b, v31.8b}, [x6], #16     // pixels 0-7, dest += 16
+        st2             {v24.8b, v25.8b}, [x6], #16     // pixels 8-15, dest += 16
+
+        subs            w7, w7, #16                     // remaining width -= 16 (sets the flags for b.gt)
+        add             x8, x8, #16                     // i += 16 (plain add, flags untouched)
+        b.gt            .Lloop_16_pixels
+        ret
+
+        // =============================================
+        // 8-pixel processing path (chrDstW % 16 != 0)
+        // =============================================
+.Lprocess_8_pixels:
+        mov             x8, #0                          // i = 0
+.Lloop_8_pixels:
+        // Initialize accumulators with dither
+        mov             v16.16b, v2.16b                 // U acc low
+        mov             v17.16b, v3.16b                 // U acc high
+        mov             v18.16b, v4.16b                 // V acc low
+        mov             v19.16b, v5.16b                 // V acc high
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+.Lfilter_loop_8:
+        ldr             h6, [x10], #2                   // h6 = chrFilter[j], advance to next coefficient
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+
+        ld1             {v20.8h}, [x13]                 // U samples
+        ld1             {v21.8h}, [x14]                 // V samples
+
+        smlal           v16.4s, v20.4h, v6.h[0]         // U acc += sample * coefficient (signed, widening)
+        smlal2          v17.4s, v20.8h, v6.h[0]
+        smlal           v18.4s, v21.4h, v6.h[0]         // V acc += sample * coefficient
+        smlal2          v19.4s, v21.8h, v6.h[0]
+
+        subs            w9, w9, #1                      // one filter tap consumed
+        b.gt            .Lfilter_loop_8                 // loop while taps remain (body runs at least once)
+
+        // Narrow to bytes (>> 19 in total, saturating to 0..255) and store
+        sqshrun         v26.4h, v16.4s, #16
+        sqshrun2        v26.8h, v17.4s, #16
+        sqshrun         v27.4h, v18.4s, #16
+        sqshrun2        v27.8h, v19.4s, #16
+        uqshrn          v28.8b, v26.8h, #3              // U
+        uqshrn          v29.8b, v27.8h, #3              // V
+
+        st2             {v28.8b, v29.8b}, [x6], #16     // 8 interleaved U,V pairs, dest += 16
+
+        subs            w7, w7, #8                      // NOTE(review): a width not divisible by 8 still gets a full block - confirm the overrun is allowed
+        add             x8, x8, #8                      // i += 8 (plain add, flags untouched)
+        b.gt            .Lloop_8_pixels
+        ret
+endfunc
+
+function ff_yuv2nv12cX_swapped_neon, export=1
+// Vertical chroma filter for swapped-chroma output: dest[2*i] = v, dest[2*i + 1] = u.
+// x0 - dstFormat (unused)          x1 - uint8_t *chrDither (8 bytes are read)
+// x2 - int16_t *chrFilter          x3 - int chrFilterSize
+// x4 - int16_t **chrUSrc           x5 - int16_t **chrVSrc
+// x6 - uint8_t *dest               x7 - int chrDstW
+// Scratch: x8-x16, v0-v6, v16-v31 and the flags; nothing is saved or restored.
+// Per pixel i: u = (chrDither[i & 7] << 12) + sum_j chrUSrc[j][i] * chrFilter[j]
+// (v uses chrDither[(i + 3) & 7] and chrVSrc); output byte = sat_u8(value >> 19).
+
+        // Load dither pattern and compute U and V dither vectors
+        ld1             {v0.8b}, [x1]                   // chrDither[0..7]
+        ext             v1.8b, v0.8b, v0.8b, #3         // Rotate for V: lane k = chrDither[(k+3)&7]
+
+        uxtl            v0.8h, v0.8b                    // widen U dither bytes to 16 bit
+        uxtl            v1.8h, v1.8b                    // widen V dither bytes to 16 bit
+
+        ushll           v2.4s, v0.4h, #12               // U dither low  (dither << 12, lanes 0-3)
+        ushll2          v3.4s, v0.8h, #12               // U dither high (dither << 12, lanes 4-7)
+        ushll           v4.4s, v1.4h, #12               // V dither low
+        ushll2          v5.4s, v1.8h, #12               // V dither high
+
+        // The 16-pixel loop is only used when the whole row is a multiple of 16
+        tst             w7, #15                         // Check if chrDstW % 16 == 0
+        b.ne            .Lprocess_swapped_8_pixels      // If not, use the 8-pixel loop for the whole row
+
+        // =============================================
+        // 16-pixel processing path
+        // =============================================
+        mov             x8, #0                          // i = 0
+.Lloop_swapped_16_pixels:
+
+        mov             v16.16b, v2.16b                 // U acc low,  pixels i..i+3   (starts at dither)
+        mov             v17.16b, v3.16b                 // U acc high, pixels i+4..i+7
+        mov             v18.16b, v4.16b                 // V acc low,  pixels i..i+3
+        mov             v19.16b, v5.16b                 // V acc high, pixels i+4..i+7
+
+        mov             v20.16b, v2.16b                 // U acc low,  pixels i+8..i+11 (dither repeats every 8)
+        mov             v21.16b, v3.16b                 // U acc high, pixels i+12..i+15
+        mov             v22.16b, v4.16b                 // V acc low,  pixels i+8..i+11
+        mov             v23.16b, v5.16b                 // V acc high, pixels i+12..i+15
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+.Lfilter_swapped_loop_16:
+        ldr             h6, [x10], #2                   // h6 = chrFilter[j], advance to next coefficient
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+        add             x15, x13, #16                   // x15 = &chrUSrc[j][i+8] (8 samples * 2 bytes)
+        add             x16, x14, #16                   // x16 = &chrVSrc[j][i+8]
+
+        ld1             {v24.8h}, [x13]                 // U samples 0-7
+        ld1             {v25.8h}, [x14]                 // V samples 0-7
+
+        ld1             {v26.8h}, [x15]                 // U samples 8-15
+        ld1             {v27.8h}, [x16]                 // V samples 8-15
+
+        smlal           v16.4s, v24.4h, v6.h[0]         // U acc += sample * coefficient (signed, widening)
+        smlal2          v17.4s, v24.8h, v6.h[0]
+        smlal           v18.4s, v25.4h, v6.h[0]         // V acc += sample * coefficient
+        smlal2          v19.4s, v25.8h, v6.h[0]
+
+        smlal           v20.4s, v26.4h, v6.h[0]         // same for pixels 8-15
+        smlal2          v21.4s, v26.8h, v6.h[0]
+        smlal           v22.4s, v27.4h, v6.h[0]
+        smlal2          v23.4s, v27.8h, v6.h[0]
+
+        subs            w9, w9, #1                      // one filter tap consumed
+        b.gt            .Lfilter_swapped_loop_16        // loop while taps remain (body runs at least once)
+
+        // Narrow pixels 0-7: >> 16 then >> 3 (>> 19 in total), saturating to 0..255
+        sqshrun         v28.4h, v16.4s, #16             // U: signed acc >> 16, saturate to unsigned 16 bit
+        sqshrun2        v28.8h, v17.4s, #16
+        sqshrun         v29.4h, v18.4s, #16             // V: likewise
+        sqshrun2        v29.8h, v19.4s, #16
+        uqshrn          v30.8b, v29.8h, #3              // V (first st2 register -> even bytes)
+        uqshrn          v31.8b, v28.8h, #3              // U (second st2 register -> odd bytes)
+
+        // Narrow pixels 8-15 the same way (v24/v25 are free again after the filter loop)
+        sqshrun         v28.4h, v20.4s, #16
+        sqshrun2        v28.8h, v21.4s, #16
+        sqshrun         v29.4h, v22.4s, #16
+        sqshrun2        v29.8h, v23.4s, #16
+        uqshrn          v24.8b, v29.8h, #3              // V
+        uqshrn          v25.8b, v28.8h, #3              // U
+
+        // Store both 8-pixel blocks as interleaved V,U byte pairs
+        st2             {v30.8b, v31.8b}, [x6], #16     // pixels 0-7, dest += 16
+        st2             {v24.8b, v25.8b}, [x6], #16     // pixels 8-15, dest += 16
+
+        subs            w7, w7, #16                     // remaining width -= 16 (sets the flags for b.gt)
+        add             x8, x8, #16                     // i += 16 (plain add, flags untouched)
+        b.gt            .Lloop_swapped_16_pixels
+        ret
+
+        // =============================================
+        // 8-pixel processing path (chrDstW % 16 != 0)
+        // =============================================
+.Lprocess_swapped_8_pixels:
+        mov             x8, #0                          // i = 0
+.Lloop_swapped_8_pixels:
+        // Initialize accumulators with dither
+        mov             v16.16b, v2.16b                 // U acc low
+        mov             v17.16b, v3.16b                 // U acc high
+        mov             v18.16b, v4.16b                 // V acc low
+        mov             v19.16b, v5.16b                 // V acc high
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+.Lfilter_swapped_loop_8:
+        ldr             h6, [x10], #2                   // h6 = chrFilter[j], advance to next coefficient
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+
+        ld1             {v20.8h}, [x13]                 // U samples
+        ld1             {v21.8h}, [x14]                 // V samples
+
+        smlal           v16.4s, v20.4h, v6.h[0]         // U acc += sample * coefficient (signed, widening)
+        smlal2          v17.4s, v20.8h, v6.h[0]
+        smlal           v18.4s, v21.4h, v6.h[0]         // V acc += sample * coefficient
+        smlal2          v19.4s, v21.8h, v6.h[0]
+
+        subs            w9, w9, #1                      // one filter tap consumed
+        b.gt            .Lfilter_swapped_loop_8         // loop while taps remain (body runs at least once)
+
+        // Narrow to bytes (>> 19 in total, saturating to 0..255) and store
+        sqshrun         v26.4h, v16.4s, #16             // U
+        sqshrun2        v26.8h, v17.4s, #16
+        sqshrun         v27.4h, v18.4s, #16             // V
+        sqshrun2        v27.8h, v19.4s, #16
+        uqshrn          v28.8b, v27.8h, #3              // V (first st2 register -> even bytes)
+        uqshrn          v29.8b, v26.8h, #3              // U (second st2 register -> odd bytes)
+
+        st2             {v28.8b, v29.8b}, [x6], #16     // 8 interleaved V,U pairs, dest += 16
+
+        subs            w7, w7, #8                      // NOTE(review): a width not divisible by 8 still gets a full block - confirm the overrun is allowed
+        add             x8, x8, #8                      // i += 8 (plain add, flags untouched)
+        b.gt            .Lloop_swapped_8_pixels
+        ret
+endfunc
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..5246d53a16 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -168,6 +168,16 @@ void ff_yuv2plane1_8_neon(
         const uint8_t *dither,
         int offset);

+void ff_yuv2nv12cX_notswapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+    const int16_t *chrFilter, int chrFilterSize,
+    const int16_t **chrUSrc, const int16_t **chrVSrc,
+    uint8_t *dest, int chrDstW);
+
+void ff_yuv2nv12cX_swapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+    const int16_t *chrFilter, int chrFilterSize,
+    const int16_t **chrUSrc, const int16_t **chrVSrc,
+    uint8_t *dest, int chrDstW);
+
 #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do {              \
     if (c->srcBpc == 8) {                                               \
         if(c->dstBpc <= 14) {                                           \
@@ -201,6 +211,12 @@ void ff_yuv2plane1_8_neon(
     default: break;                                                     \
     }

+#define ASSIGN_YUV2NV12_FUNC(yuv2nv12fn, opt, dstFormat)               \
+    if(!isSwappedChroma(dstFormat))                                    \
+        yuv2nv12fn = ff_yuv2nv12cX_notswapped_  ## opt;                \
+    else                                                               \
+        yuv2nv12fn = ff_yuv2nv12cX_swapped_  ## opt;
+
 #define NEON_INPUT(name) \
 void ff_##name##ToY_neon(uint8_t *dst, const uint8_t *src, const uint8_t *, \
                         const uint8_t *, int w, uint32_t *coeffs, void *); \
@@ -275,7 +291,10 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
+            if(isSemiPlanarYUV(c->opts.dst_format))
+               ASSIGN_YUV2NV12_FUNC(c->yuv2nv12cX, neon, c->opts.dst_format);
         }
+
         switch (c->opts.src_format) {
         case AV_PIX_FMT_ABGR:
             c->lumToYV12 = ff_abgr32ToY_neon;
--
2.34.1


________________________________
From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> on behalf of Martin Storsjö <martin@martin.st>
Sent: 06 June 2025 12:37
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: Dash Santosh Sathyanarayanan <dash.sathyanarayanan@multicorewareinc.com>; Logaprakash Ramajayam <logaprakash.ramajayam@multicorewareinc.com>
Subject: Re: [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c()

On Fri, 6 Jun 2025, Harshitha Sarangu Suresh wrote:

> Changed indentation, checked for FATE tests and gha-aarch64 git
> workflow. Everything passed.

I doubt that everything passed; this doesn't even compile. See below:

> @@ -275,7 +291,10 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
>         ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
>         if (c->dstBpc == 8) {
>             c->yuv2planeX = ff_yuv2planeX_8_neon;
> +            if(isSemiPlanarYUV(c->opts.dst_format)
> +               ASSIGN_YUV2NV12_FUNC(c->yuv2nv12cX, neon, c->opts.dst_format);
>         }
> +

src/libswscale/aarch64/swscale.c: In function
‘ff_sws_init_swscale_aarch64’:
src/libswscale/aarch64/swscale.c:294:51: error: expected ‘)’ before ‘if’
   294 |             if(isSemiPlanarYUV(c->opts.dst_format)
       |               ~                                   ^
       |                                                   )

// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

[-- Attachment #2: swscale-aarch64-output-Implement-neon-assembly-.patch --]
[-- Type: application/octet-stream, Size: 17316 bytes --]

[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c()
  2025-06-06  6:52 Harshitha Sarangu Suresh
@ 2025-06-06  7:07 ` Martin Storsjö
  2025-06-06  9:08   ` Harshitha Sarangu Suresh
  0 siblings, 1 reply; 5+ messages in thread
From: Martin Storsjö @ 2025-06-06  7:07 UTC (permalink / raw)
  To: FFmpeg development discussions and patches
  Cc: Dash Santosh Sathyanarayanan, Logaprakash Ramajayam

On Fri, 6 Jun 2025, Harshitha Sarangu Suresh wrote:

> Changed indentation, checked for FATE tests and gha-aarch64 git 
> workflow. Everything passed.

I doubt that everything passed; this doesn't even compile. See below:

> @@ -275,7 +291,10 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
>         ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
>         if (c->dstBpc == 8) {
>             c->yuv2planeX = ff_yuv2planeX_8_neon;
> +            if(isSemiPlanarYUV(c->opts.dst_format)
> +               ASSIGN_YUV2NV12_FUNC(c->yuv2nv12cX, neon, c->opts.dst_format);
>         }
> +

src/libswscale/aarch64/swscale.c: In function 
‘ff_sws_init_swscale_aarch64’:
src/libswscale/aarch64/swscale.c:294:51: error: expected ‘)’ before ‘if’
   294 |             if(isSemiPlanarYUV(c->opts.dst_format)
       |               ~                                   ^
       |                                                   )

// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c()
@ 2025-06-06  6:52 Harshitha Sarangu Suresh
  2025-06-06  7:07 ` Martin Storsjö
  0 siblings, 1 reply; 5+ messages in thread
From: Harshitha Sarangu Suresh @ 2025-06-06  6:52 UTC (permalink / raw)
  To: Kieran Kunhya via ffmpeg-devel
  Cc: Dash Santosh Sathyanarayanan, Logaprakash Ramajayam

[-- Attachment #1: Type: text/plain, Size: 17808 bytes --]

Changed indentation, checked for FATE tests and gha-aarch64 git workflow. Everything passed.

From 4a590438d61a10d4f75c7a567af2890bc08332ca Mon Sep 17 00:00:00 2001
From: Harshitha Suresh <harshitha@multicorewareinc.com>
Date: Mon, 19 May 2025 22:37:20 +0530
Subject: [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c().

yuv2nv12cX_2_512_accurate_c:                          3508.8 ( 1.00x)
yuv2nv12cX_2_512_accurate_neon:                        369.2 ( 9.50x)
yuv2nv12cX_2_512_approximate_c:                       3499.0 ( 1.00x)
yuv2nv12cX_2_512_approximate_neon:                     370.2 ( 9.45x)
yuv2nv12cX_4_512_accurate_c:                          4683.0 ( 1.00x)
yuv2nv12cX_4_512_accurate_neon:                        568.8 ( 8.23x)
yuv2nv12cX_4_512_approximate_c:                       4682.6 ( 1.00x)
yuv2nv12cX_4_512_approximate_neon:                     569.9 ( 8.22x)
yuv2nv12cX_8_512_accurate_c:                          7243.0 ( 1.00x)
yuv2nv12cX_8_512_accurate_neon:                        937.6 ( 7.72x)
yuv2nv12cX_8_512_approximate_c:                       7235.9 ( 1.00x)
yuv2nv12cX_8_512_approximate_neon:                     938.3 ( 7.71x)
yuv2nv12cX_16_512_accurate_c:                        13749.7 ( 1.00x)
yuv2nv12cX_16_512_accurate_neon:                      1708.1 ( 8.05x)
yuv2nv12cX_16_512_approximate_c:                     13750.0 ( 1.00x)
yuv2nv12cX_16_512_approximate_neon:                   1708.6 ( 8.05x)
---
 libswscale/aarch64/output.S  | 306 +++++++++++++++++++++++++++++++++++
 libswscale/aarch64/swscale.c |  19 +++
 2 files changed, 325 insertions(+)

diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 190c438870..2d87cc6a5e 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -226,3 +226,309 @@ function ff_yuv2plane1_8_neon, export=1
         b.gt            2b                              // loop until width consumed
         ret
 endfunc
+
+function ff_yuv2nv12cX_notswapped_neon, export=1
+// Filters chrFilterSize U/V source rows and stores interleaved 8-bit U,V pairs (U first).
+// x0 - dstFormat (unused)
+// x1 - uint8_t *chrDither (8 bytes are read); x2 - int16_t *chrFilter
+// w3 - int chrFilterSize (the filter loops below always run at least once)
+// x4 - int16_t **chrUSrc; x5 - int16_t **chrVSrc
+// x6 - uint8_t *dest (advanced by 2 bytes per pixel)
+// w7 - int chrDstW (consumed in groups of 16 or 8 pixels)
+// Scratch registers written: x8-x16, v0-v6, v16-v31
+
+        // Build the 32-bit accumulator start values: dither << 12 for U and V
+        ld1             {v0.8b}, [x1]                   // chrDither[0..7]
+        ext             v1.8b, v0.8b, v0.8b, #3         // Rotate for V: lane i = chrDither[(i+3)&7]
+
+        uxtl            v0.8h, v0.8b                    // widen U dither to 16 bit
+        uxtl            v1.8h, v1.8b                    // widen V dither to 16 bit
+
+        ushll           v2.4s, v0.4h, #12               // U dither low  (lanes 0-3) << 12
+        ushll2          v3.4s, v0.8h, #12               // U dither high (lanes 4-7) << 12
+        ushll           v4.4s, v1.4h, #12               // V dither low  (lanes 0-3) << 12
+        ushll2          v5.4s, v1.8h, #12               // V dither high (lanes 4-7) << 12
+
+        // Pick the loop: 16 pixels per iteration only when chrDstW is a multiple of 16
+        tst             w7, #15                         // Check if chrDstW % 16 == 0
+        b.ne            .Lprocess_8_pixels              // If not, use 8-pixel version
+
+        // =============================================
+        // 16-pixel processing path (chrDstW % 16 == 0)
+        // =============================================
+        mov             x8, #0                          // i = 0
+.Lloop_16_pixels:
+
+        mov             v16.16b, v2.16b                 // U acc low,  pixels 0-3
+        mov             v17.16b, v3.16b                 // U acc high, pixels 4-7
+        mov             v18.16b, v4.16b                 // V acc low,  pixels 0-3
+        mov             v19.16b, v5.16b                 // V acc high, pixels 4-7
+
+        mov             v20.16b, v2.16b                 // U acc low,  pixels 8-11 (same dither: index & 7 repeats)
+        mov             v21.16b, v3.16b                 // U acc high, pixels 12-15
+        mov             v22.16b, v4.16b                 // V acc low,  pixels 8-11
+        mov             v23.16b, v5.16b                 // V acc high, pixels 12-15
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+.Lfilter_loop_16:
+        ldr             h6, [x10], #2                   // v6.h[0] = chrFilter[j], pointer advances to j+1
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+        add             x15, x13, #16                   // x15 = &chrUSrc[j][i+8] (8 samples * 2 bytes)
+        add             x16, x14, #16                   // x16 = &chrVSrc[j][i+8]
+
+        ld1             {v24.8h}, [x13]                 // U samples 0-7
+        ld1             {v25.8h}, [x14]                 // V samples 0-7
+
+        ld1             {v26.8h}, [x15]                 // U samples 8-15
+        ld1             {v27.8h}, [x16]                 // V samples 8-15
+
+        smlal           v16.4s, v24.4h, v6.h[0]         // U[0-3]   += sample * coeff (signed 16x16 -> 32)
+        smlal2          v17.4s, v24.8h, v6.h[0]         // U[4-7]   += sample * coeff
+        smlal           v18.4s, v25.4h, v6.h[0]         // V[0-3]   += sample * coeff
+        smlal2          v19.4s, v25.8h, v6.h[0]         // V[4-7]   += sample * coeff
+
+        smlal           v20.4s, v26.4h, v6.h[0]         // U[8-11]  += sample * coeff
+        smlal2          v21.4s, v26.8h, v6.h[0]         // U[12-15] += sample * coeff
+        smlal           v22.4s, v27.4h, v6.h[0]         // V[8-11]  += sample * coeff
+        smlal2          v23.4s, v27.8h, v6.h[0]         // V[12-15] += sample * coeff
+
+        subs            w9, w9, #1                      // one filter tap done
+        b.gt            .Lfilter_loop_16                // repeat while taps remain
+
+        // Narrow pixels 0-7: acc >> 16 saturated to u16, then >> 3 saturated to u8 (>> 19 in total)
+        sqshrun         v28.4h, v16.4s, #16
+        sqshrun2        v28.8h, v17.4s, #16
+        sqshrun         v29.4h, v18.4s, #16
+        sqshrun2        v29.8h, v19.4s, #16
+        uqshrn          v30.8b, v28.8h, #3              // U
+        uqshrn          v31.8b, v29.8h, #3              // V
+
+        // Narrow pixels 8-15 the same way (v28/v29 are reused as temporaries)
+        sqshrun         v28.4h, v20.4s, #16
+        sqshrun2        v28.8h, v21.4s, #16
+        sqshrun         v29.4h, v22.4s, #16
+        sqshrun2        v29.8h, v23.4s, #16
+        uqshrn          v24.8b, v28.8h, #3              // U
+        uqshrn          v25.8b, v29.8h, #3              // V
+
+        // Store both 8-pixel blocks, st2 interleaves them as U0 V0 U1 V1 ...
+        st2             {v30.8b, v31.8b}, [x6], #16
+        st2             {v24.8b, v25.8b}, [x6], #16
+
+        subs            w7, w7, #16                     // 16 pixels written
+        add             x8, x8, #16                     // i += 16 (add leaves the flags from subs intact)
+        b.gt            .Lloop_16_pixels                // loop while pixels remain
+        ret
+
+        // =============================================
+        // 8-pixel processing path (chrDstW % 16 != 0); every iteration
+        // stores a full group of 8 U/V pairs, including the last one
+        // =============================================
+.Lprocess_8_pixels:
+        mov             x8, #0                        // i = 0
+.Lloop_8_pixels:
+        // Initialize accumulators with dither
+        mov             v16.16b, v2.16b               // U acc low
+        mov             v17.16b, v3.16b               // U acc high
+        mov             v18.16b, v4.16b               // V acc low
+        mov             v19.16b, v5.16b               // V acc high
+
+        mov             w9, w3                        // chrFilterSize counter
+        mov             x10, x2                       // chrFilter pointer
+        mov             x11, x4                       // chrUSrc base
+        mov             x12, x5                       // chrVSrc base
+
+.Lfilter_loop_8:
+        ldr             h6, [x10], #2                 // v6.h[0] = chrFilter[j], pointer advances to j+1
+
+        ldr             x13, [x11], #8                // chrUSrc[j]
+        ldr             x14, [x12], #8                // chrVSrc[j]
+        add             x13, x13, x8, lsl #1          // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1          // &chrVSrc[j][i]
+
+        ld1             {v20.8h}, [x13]               // U samples
+        ld1             {v21.8h}, [x14]               // V samples
+
+        smlal           v16.4s, v20.4h, v6.h[0]       // U[0-3] += sample * coeff (signed 16x16 -> 32)
+        smlal2          v17.4s, v20.8h, v6.h[0]       // U[4-7] += sample * coeff
+        smlal           v18.4s, v21.4h, v6.h[0]       // V[0-3] += sample * coeff
+        smlal2          v19.4s, v21.8h, v6.h[0]       // V[4-7] += sample * coeff
+
+        subs            w9, w9, #1                    // one filter tap done
+        b.gt            .Lfilter_loop_8               // repeat while taps remain
+
+        // Narrow: acc >> 16 saturated to u16, then >> 3 saturated to u8, and store
+        sqshrun         v26.4h, v16.4s, #16
+        sqshrun2        v26.8h, v17.4s, #16
+        sqshrun         v27.4h, v18.4s, #16
+        sqshrun2        v27.8h, v19.4s, #16
+        uqshrn          v28.8b, v26.8h, #3            // U
+        uqshrn          v29.8b, v27.8h, #3            // V
+
+        st2             {v28.8b, v29.8b}, [x6], #16   // interleaved U0 V0 U1 V1 ..., dest += 16
+
+        subs            w7, w7, #8                    // 8 pixels written
+        add             x8, x8, #8                    // i += 8 (add leaves the flags from subs intact)
+        b.gt            .Lloop_8_pixels               // loop while pixels remain
+        ret
+endfunc
+
+function ff_yuv2nv12cX_swapped_neon, export=1
+// Filters chrFilterSize U/V source rows and stores interleaved 8-bit V,U pairs (V first).
+// x0 - dstFormat (unused)
+// x1 - uint8_t *chrDither (8 bytes are read); x2 - int16_t *chrFilter
+// w3 - int chrFilterSize (the filter loops below always run at least once)
+// x4 - int16_t **chrUSrc; x5 - int16_t **chrVSrc
+// x6 - uint8_t *dest (advanced by 2 bytes per pixel)
+// w7 - int chrDstW (consumed in groups of 16 or 8 pixels)
+// Scratch registers written: x8-x16, v0-v6, v16-v31
+
+        // Build the 32-bit accumulator start values: dither << 12 for U and V
+        ld1             {v0.8b}, [x1]                   // chrDither[0..7]
+        ext             v1.8b, v0.8b, v0.8b, #3         // Rotate for V: lane i = chrDither[(i+3)&7]
+
+        uxtl            v0.8h, v0.8b                    // widen U dither to 16 bit
+        uxtl            v1.8h, v1.8b                    // widen V dither to 16 bit
+
+        ushll           v2.4s, v0.4h, #12               // U dither low  (lanes 0-3) << 12
+        ushll2          v3.4s, v0.8h, #12               // U dither high (lanes 4-7) << 12
+        ushll           v4.4s, v1.4h, #12               // V dither low  (lanes 0-3) << 12
+        ushll2          v5.4s, v1.8h, #12               // V dither high (lanes 4-7) << 12
+
+        // Pick the loop: 16 pixels per iteration only when chrDstW is a multiple of 16
+        tst             w7, #15                         // Check if chrDstW % 16 == 0
+        b.ne            .Lprocess_swapped_8_pixels              // If not, use 8-pixel version
+
+        // =============================================
+        // 16-pixel processing path (chrDstW % 16 == 0)
+        // =============================================
+        mov             x8, #0                          // i = 0
+.Lloop_swapped_16_pixels:
+
+        mov             v16.16b, v2.16b                 // U acc low,  pixels 0-3
+        mov             v17.16b, v3.16b                 // U acc high, pixels 4-7
+        mov             v18.16b, v4.16b                 // V acc low,  pixels 0-3
+        mov             v19.16b, v5.16b                 // V acc high, pixels 4-7
+
+        mov             v20.16b, v2.16b                 // U acc low,  pixels 8-11 (same dither: index & 7 repeats)
+        mov             v21.16b, v3.16b                 // U acc high, pixels 12-15
+        mov             v22.16b, v4.16b                 // V acc low,  pixels 8-11
+        mov             v23.16b, v5.16b                 // V acc high, pixels 12-15
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+.Lfilter_swapped_loop_16:
+        ldr             h6, [x10], #2                   // v6.h[0] = chrFilter[j], pointer advances to j+1
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+        add             x15, x13, #16                   // x15 = &chrUSrc[j][i+8] (8 samples * 2 bytes)
+        add             x16, x14, #16                   // x16 = &chrVSrc[j][i+8]
+
+        ld1             {v24.8h}, [x13]                 // U samples 0-7
+        ld1             {v25.8h}, [x14]                 // V samples 0-7
+
+        ld1             {v26.8h}, [x15]                 // U samples 8-15
+        ld1             {v27.8h}, [x16]                 // V samples 8-15
+
+        smlal           v16.4s, v24.4h, v6.h[0]         // U[0-3]   += sample * coeff (signed 16x16 -> 32)
+        smlal2          v17.4s, v24.8h, v6.h[0]         // U[4-7]   += sample * coeff
+        smlal           v18.4s, v25.4h, v6.h[0]         // V[0-3]   += sample * coeff
+        smlal2          v19.4s, v25.8h, v6.h[0]         // V[4-7]   += sample * coeff
+
+        smlal           v20.4s, v26.4h, v6.h[0]         // U[8-11]  += sample * coeff
+        smlal2          v21.4s, v26.8h, v6.h[0]         // U[12-15] += sample * coeff
+        smlal           v22.4s, v27.4h, v6.h[0]         // V[8-11]  += sample * coeff
+        smlal2          v23.4s, v27.8h, v6.h[0]         // V[12-15] += sample * coeff
+
+        subs            w9, w9, #1                      // one filter tap done
+        b.gt            .Lfilter_swapped_loop_16        // repeat while taps remain
+
+        // Narrow pixels 0-7: acc >> 16 saturated to u16, then >> 3 saturated to u8 (>> 19 in total)
+        sqshrun         v28.4h, v16.4s, #16
+        sqshrun2        v28.8h, v17.4s, #16
+        sqshrun         v29.4h, v18.4s, #16
+        sqshrun2        v29.8h, v19.4s, #16
+        uqshrn          v30.8b, v29.8h, #3              // V (first register of the st2 pair)
+        uqshrn          v31.8b, v28.8h, #3              // U
+
+        // Narrow pixels 8-15 the same way (v28/v29 are reused as temporaries)
+        sqshrun         v28.4h, v20.4s, #16
+        sqshrun2        v28.8h, v21.4s, #16
+        sqshrun         v29.4h, v22.4s, #16
+        sqshrun2        v29.8h, v23.4s, #16
+        uqshrn          v24.8b, v29.8h, #3              // V (first register of the st2 pair)
+        uqshrn          v25.8b, v28.8h, #3              // U
+
+        // Store both 8-pixel blocks, st2 interleaves them as V0 U0 V1 U1 ...
+        st2             {v30.8b, v31.8b}, [x6], #16
+        st2             {v24.8b, v25.8b}, [x6], #16
+
+        subs            w7, w7, #16                     // 16 pixels written
+        add             x8, x8, #16                     // i += 16 (add leaves the flags from subs intact)
+        b.gt            .Lloop_swapped_16_pixels        // loop while pixels remain
+        ret
+
+        // =============================================
+        // 8-pixel processing path (chrDstW % 16 != 0); every iteration
+        // stores a full group of 8 V/U pairs, including the last one
+        // =============================================
+.Lprocess_swapped_8_pixels:
+        mov             x8, #0                        // i = 0
+.Lloop_swapped_8_pixels:
+        // Initialize accumulators with dither
+        mov             v16.16b, v2.16b               // U acc low
+        mov             v17.16b, v3.16b               // U acc high
+        mov             v18.16b, v4.16b               // V acc low
+        mov             v19.16b, v5.16b               // V acc high
+
+        mov             w9, w3                        // chrFilterSize counter
+        mov             x10, x2                       // chrFilter pointer
+        mov             x11, x4                       // chrUSrc base
+        mov             x12, x5                       // chrVSrc base
+
+.Lfilter_swapped_loop_8:
+        ldr             h6, [x10], #2                 // v6.h[0] = chrFilter[j], pointer advances to j+1
+
+        ldr             x13, [x11], #8                // chrUSrc[j]
+        ldr             x14, [x12], #8                // chrVSrc[j]
+        add             x13, x13, x8, lsl #1          // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1          // &chrVSrc[j][i]
+
+        ld1             {v20.8h}, [x13]               // U samples
+        ld1             {v21.8h}, [x14]               // V samples
+
+        smlal           v16.4s, v20.4h, v6.h[0]       // U[0-3] += sample * coeff (signed 16x16 -> 32)
+        smlal2          v17.4s, v20.8h, v6.h[0]       // U[4-7] += sample * coeff
+        smlal           v18.4s, v21.4h, v6.h[0]       // V[0-3] += sample * coeff
+        smlal2          v19.4s, v21.8h, v6.h[0]       // V[4-7] += sample * coeff
+
+        subs            w9, w9, #1                    // one filter tap done
+        b.gt            .Lfilter_swapped_loop_8       // repeat while taps remain
+
+        // Narrow: acc >> 16 saturated to u16, then >> 3 saturated to u8, and store
+        sqshrun         v26.4h, v16.4s, #16
+        sqshrun2        v26.8h, v17.4s, #16
+        sqshrun         v27.4h, v18.4s, #16
+        sqshrun2        v27.8h, v19.4s, #16
+        uqshrn          v28.8b, v27.8h, #3            // V (first register of the st2 pair)
+        uqshrn          v29.8b, v26.8h, #3            // U
+
+        st2             {v28.8b, v29.8b}, [x6], #16   // interleaved V0 U0 V1 U1 ..., dest += 16
+
+        subs            w7, w7, #8                    // 8 pixels written
+        add             x8, x8, #8                    // i += 8 (add leaves the flags from subs intact)
+        b.gt            .Lloop_swapped_8_pixels       // loop while pixels remain
+        ret
+endfunc
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..c07e375025 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -168,6 +168,16 @@ void ff_yuv2plane1_8_neon(
         const uint8_t *dither,
         int offset);

+void ff_yuv2nv12cX_notswapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+    const int16_t *chrFilter, int chrFilterSize,
+    const int16_t **chrUSrc, const int16_t **chrVSrc,
+    uint8_t *dest, int chrDstW);
+
+void ff_yuv2nv12cX_swapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+    const int16_t *chrFilter, int chrFilterSize,
+    const int16_t **chrUSrc, const int16_t **chrVSrc,
+    uint8_t *dest, int chrDstW);
+
 #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do {              \
     if (c->srcBpc == 8) {                                               \
         if(c->dstBpc <= 14) {                                           \
@@ -201,6 +211,12 @@ void ff_yuv2plane1_8_neon(
     default: break;                                                     \
     }

+#define ASSIGN_YUV2NV12_FUNC(yuv2nv12fn, opt, dstFormat)               \
+    if(!isSwappedChroma(dstFormat))                                    \
+        yuv2nv12fn = ff_yuv2nv12cX_notswapped_  ## opt;                \
+    else                                                               \
+        yuv2nv12fn = ff_yuv2nv12cX_swapped_  ## opt;
+
 #define NEON_INPUT(name) \
 void ff_##name##ToY_neon(uint8_t *dst, const uint8_t *src, const uint8_t *, \
                         const uint8_t *, int w, uint32_t *coeffs, void *); \
@@ -275,7 +291,10 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
+            if(isSemiPlanarYUV(c->opts.dst_format)
+               ASSIGN_YUV2NV12_FUNC(c->yuv2nv12cX, neon, c->opts.dst_format);
         }
+
         switch (c->opts.src_format) {
         case AV_PIX_FMT_ABGR:
             c->lumToYV12 = ff_abgr32ToY_neon;
--
2.36.0.windows.1



[-- Attachment #2: swscale-aarch64-output-Implement-neon-assembly-.patch --]
[-- Type: application/octet-stream, Size: 17326 bytes --]

[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-06-06  9:08 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-06-02  4:36 [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c() Harshitha Sarangu Suresh
2025-06-02  7:52 ` Martin Storsjö
2025-06-06  6:52 Harshitha Sarangu Suresh
2025-06-06  7:07 ` Martin Storsjö
2025-06-06  9:08   ` Harshitha Sarangu Suresh

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git