Handled all the comments, removed code duplication for swapped and
non-swapped case.

Checkasm Benchmark Results
yuv2nv12cX_2_512_accurate_c:        3496.2 ( 1.00x)
yuv2nv12cX_2_512_accurate_neon:      409.5 ( 8.54x)
yuv2nv12cX_2_512_approximate_c:     3495.1 ( 1.00x)
yuv2nv12cX_2_512_approximate_neon:   409.4 ( 8.54x)
yuv2nv12cX_4_512_accurate_c:        4676.5 ( 1.00x)
yuv2nv12cX_4_512_accurate_neon:      613.1 ( 7.63x)
yuv2nv12cX_4_512_approximate_c:     4677.8 ( 1.00x)
yuv2nv12cX_4_512_approximate_neon:   607.8 ( 7.70x)
yuv2nv12cX_8_512_accurate_c:        7221.6 ( 1.00x)
yuv2nv12cX_8_512_accurate_neon:     1003.8 ( 7.19x)
yuv2nv12cX_8_512_approximate_c:     7221.2 ( 1.00x)
yuv2nv12cX_8_512_approximate_neon:  1016.4 ( 7.11x)
yuv2nv12cX_16_512_accurate_c:      13731.1 ( 1.00x)
yuv2nv12cX_16_512_accurate_neon:    1757.2 ( 7.81x)
yuv2nv12cX_16_512_approximate_c:   13740.7 ( 1.00x)
yuv2nv12cX_16_512_approximate_neon: 1757.3 ( 7.82x)

From 4ca92570924ba42e20041feaae6d7488c02c1e6a Mon Sep 17 00:00:00 2001
From: Harshitha Suresh
Date: Fri, 4 Jul 2025 14:29:11 +0530
Subject: [PATCH] swscale/output: Implement yuv2nv12cx neon assembly

---
 libswscale/aarch64/output.S  | 233 +++++++++++++++++++++++++++++++++++
 libswscale/aarch64/swscale.c |  24 ++++
 2 files changed, 257 insertions(+)

diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 190c438870..92dec3f0ed 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -226,3 +226,236 @@ function ff_yuv2plane1_8_neon, export=1
         b.gt            2b                      // loop until width consumed
         ret
 endfunc
+
+function ff_yuv2nv12cX_neon_asm, export=1
+// w0 - isSwapped (non-zero: store U,V pairs; zero: store V,U pairs)
+// x1 - uint8_t *chrDither
+// x2 - int16_t *chrFilter
+// x3 - int chrFilterSize
+// x4 - int16_t **chrUSrc
+// x5 - int16_t **chrVSrc
+// x6 - uint8_t *dest
+// x7 - int chrDstW
+
+        stp             x19, x20, [sp, #-16]!   // save callee-saved regs used by scalar tail
+        stp             x21, x22, [sp, #-16]!
+
+        ld1             {v0.8b}, [x1]           // chrDither[0..7]
+        ext             v1.8b, v0.8b, v0.8b, #3 // Rotate for V: (i+3)&7
+
+        uxtl            v0.8h, v0.8b
+        uxtl            v1.8h, v1.8b
+
+        ushll           v2.4s, v0.4h, #12       // U dither low
+        ushll2          v3.4s, v0.8h, #12       // U dither high
+        ushll           v4.4s, v1.4h, #12       // V dither low
+        ushll2          v5.4s, v1.8h, #12       // V dither high
+
+        mov             x8, #0                  // i = 0
+1:
+        cmp             w7, #16                 // 16-pixel main loop
+        blt             7f
+
+        mov             v16.16b, v2.16b         // U acc low
+        mov             v17.16b, v3.16b         // U acc high
+        mov             v18.16b, v4.16b         // V acc low
+        mov             v19.16b, v5.16b         // V acc high
+
+        mov             v20.16b, v2.16b         // accumulators for pixels 8-15
+        mov             v21.16b, v3.16b
+        mov             v22.16b, v4.16b
+        mov             v23.16b, v5.16b
+
+        mov             w9, w3                  // chrFilterSize counter
+        mov             x10, x2                 // chrFilter pointer
+        mov             x11, x4                 // chrUSrc base
+        mov             x12, x5                 // chrVSrc base
+
+2:
+        ldr             h6, [x10], #2           // Load filter coefficient
+
+        ldr             x13, [x11], #8          // chrUSrc[j]
+        ldr             x14, [x12], #8          // chrVSrc[j]
+        add             x13, x13, x8, lsl #1    // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1    // &chrVSrc[j][i]
+        add             x15, x13, #16
+        add             x16, x14, #16
+
+        ld1             {v24.8h}, [x13]         // U samples 0-7
+        ld1             {v25.8h}, [x14]         // V samples 0-7
+
+        ld1             {v26.8h}, [x15]         // U samples 8-15
+        ld1             {v27.8h}, [x16]         // V samples 8-15
+
+        smlal           v16.4s, v24.4h, v6.h[0]
+        smlal2          v17.4s, v24.8h, v6.h[0]
+        smlal           v18.4s, v25.4h, v6.h[0]
+        smlal2          v19.4s, v25.8h, v6.h[0]
+
+        smlal           v20.4s, v26.4h, v6.h[0]
+        smlal2          v21.4s, v26.8h, v6.h[0]
+        smlal           v22.4s, v27.4h, v6.h[0]
+        smlal2          v23.4s, v27.8h, v6.h[0]
+
+        subs            w9, w9, #1
+        b.gt            2b
+
+        sqshrun         v28.4h, v16.4s, #16     // Process and store first 8 pixels
+        sqshrun2        v28.8h, v17.4s, #16
+        sqshrun         v29.4h, v18.4s, #16
+        sqshrun2        v29.8h, v19.4s, #16
+
+        cbz             w0, 3f
+        uqshrn          v24.8b, v28.8h, #3      // Storing U
+        uqshrn          v25.8b, v29.8h, #3      // Storing V
+        st2             {v24.8b, v25.8b}, [x6], #16
+        b               4f
+3:
+        uqshrn          v24.8b, v29.8h, #3      // Storing V
+        uqshrn          v25.8b, v28.8h, #3      // Storing U
+        st2             {v24.8b, v25.8b}, [x6], #16
+
+4:
+        sqshrun         v28.4h, v20.4s, #16     // Process and store next 8 pixels
+        sqshrun2        v28.8h, v21.4s, #16
+        sqshrun         v29.4h, v22.4s, #16
+        sqshrun2        v29.8h, v23.4s, #16
+
+        cbz             w0, 5f
+        uqshrn          v30.8b, v28.8h, #3      // Storing U
+        uqshrn          v31.8b, v29.8h, #3      // Storing V
+        st2             {v30.8b, v31.8b}, [x6], #16
+        b               6f
+5:
+        uqshrn          v30.8b, v29.8h, #3      // Storing V
+        uqshrn          v31.8b, v28.8h, #3      // Storing U
+        st2             {v30.8b, v31.8b}, [x6], #16
+
+6:
+        subs            w7, w7, #16
+        add             x8, x8, #16
+        b.gt            1b
+
+7:
+        cmp             w7, #8                  // 8-pixel tail
+        blt             12f
+8:
+        mov             v16.16b, v2.16b         // U acc low
+        mov             v17.16b, v3.16b         // U acc high
+        mov             v18.16b, v4.16b         // V acc low
+        mov             v19.16b, v5.16b         // V acc high
+
+        mov             w9, w3                  // chrFilterSize counter
+        mov             x10, x2                 // chrFilter pointer
+        mov             x11, x4                 // chrUSrc base
+        mov             x12, x5                 // chrVSrc base
+
+9:
+        ldr             h6, [x10], #2           // Load filter coefficient
+
+        ldr             x13, [x11], #8          // chrUSrc[j]
+        ldr             x14, [x12], #8          // chrVSrc[j]
+        add             x13, x13, x8, lsl #1    // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1    // &chrVSrc[j][i]
+
+        ld1             {v20.8h}, [x13]         // U samples
+        ld1             {v21.8h}, [x14]         // V samples
+
+        smlal           v16.4s, v20.4h, v6.h[0]
+        smlal2          v17.4s, v20.8h, v6.h[0]
+        smlal           v18.4s, v21.4h, v6.h[0]
+        smlal2          v19.4s, v21.8h, v6.h[0]
+
+        subs            w9, w9, #1
+        b.gt            9b
+
+        sqshrun         v26.4h, v16.4s, #16     // Final processing and store
+        sqshrun2        v26.8h, v17.4s, #16
+        sqshrun         v27.4h, v18.4s, #16
+        sqshrun2        v27.8h, v19.4s, #16
+
+        cbz             w0, 10f
+        uqshrn          v28.8b, v26.8h, #3      // Storing U
+        uqshrn          v29.8b, v27.8h, #3      // Storing V
+        st2             {v28.8b, v29.8b}, [x6], #16
+        b               11f
+10:
+        uqshrn          v28.8b, v27.8h, #3      // Storing V
+        uqshrn          v29.8b, v26.8h, #3      // Storing U
+        st2             {v28.8b, v29.8b}, [x6], #16
+11:
+        subs            w7, w7, #8
+        add             x8, x8, #8
+
+12:
+        cbz             w7, 17f                 // Scalar loop
+
+13:
+        and             x15, x8, #7
+        ldrb            w9, [x1, x15]
+        sxtw            x9, w9
+        lsl             x9, x9, #12             // u = chrDither[i & 7] << 12;
+
+        add             x15, x8, #3
+        and             x15, x15, #7
+        ldrb            w10, [x1, x15]
+        sxtw            x10, w10
+        lsl             x10, x10, #12           // v = chrDither[(i + 3) & 7] << 12;
+
+        mov             w11, w3                 // chrFilterSize counter
+        mov             x12, x2                 // chrFilter pointer
+        mov             x13, x4                 // chrUSrc base
+        mov             x14, x5                 // chrVSrc base
+
+14:
+        ldrsh           x16, [x12], #2          // chrFilter[j]
+
+        ldr             x17, [x13], #8          // chrUSrc[j]
+        ldr             x19, [x14], #8          // chrVSrc[j]
+        add             x17, x17, x8, lsl #1    // &chrUSrc[j][i]
+        add             x19, x19, x8, lsl #1    // &chrVSrc[j][i]
+
+        ldrsh           x20, [x17]
+        ldrsh           x21, [x19]
+
+        madd            x9, x16, x20, x9        // u += chrFilter[j] * chrUSrc[j][i]
+        madd            x10, x16, x21, x10      // v += chrFilter[j] * chrVSrc[j][i]
+
+        subs            w11, w11, #1
+        b.gt            14b
+
+        asr             x9, x9, #19             // Process and store U and V
+        asr             x10, x10, #19
+
+        cmp             x9, #0                  // clamp negative to 0
+        csel            x9, x9, xzr, ge
+        cmp             x10, #0
+        csel            x10, x10, xzr, ge
+
+        mov             x22, #1                 // build 255 = (1 << 8) - 1
+        lsl             x22, x22, #8
+        sub             x22, x22, #1
+
+        cmp             x9, x22                 // clamp to 255
+        csel            x9, x22, x9, gt
+        cmp             x10, x22
+        csel            x10, x22, x10, gt
+
+        cbz             w0, 15f
+        strb            w9, [x6], #1            // Storing U
+        strb            w10, [x6], #1           // Storing V
+        b               16f
+15:
+        strb            w10, [x6], #1           // Storing V
+        strb            w9, [x6], #1            // Storing U
+
+16:
+        subs            w7, w7, #1
+        add             x8, x8, #1
+        b.gt            13b
+17:
+        ldp             x21, x22, [sp], #16     // restore callee-saved regs
+        ldp             x19, x20, [sp], #16
+        ret
+
+endfunc
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..a7dcc451dc 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -168,6 +168,28 @@ void ff_yuv2plane1_8_neon(
         const uint8_t *dither,
         int offset);
 
+void ff_yuv2nv12cX_neon_asm(int isSwapped, const uint8_t *chrDither,
+                            const int16_t *chrFilter, int chrFilterSize,
+                            const int16_t **chrUSrc, const int16_t **chrVSrc,
+                            uint8_t *dest, int chrDstW);
+
+static void ff_yuv2nv12cX_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+                               const int16_t *chrFilter, int chrFilterSize,
+                               const int16_t **chrUSrc, const int16_t **chrVSrc,
+                               uint8_t *dest, int chrDstW)
+{
+    if (!isSwappedChroma(dstFormat))
+    {
+        ff_yuv2nv12cX_neon_asm(1, chrDither, chrFilter, chrFilterSize,
+                               chrUSrc, chrVSrc, dest, chrDstW);
+    }
+    else
+    {
+        ff_yuv2nv12cX_neon_asm(0, chrDither, chrFilter, chrFilterSize,
+                               chrUSrc, chrVSrc, dest, chrDstW);
+    }
+}
+
 #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do { \
     if (c->srcBpc == 8) { \
         if(c->dstBpc <= 14) { \
@@ -275,6 +297,8 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
+            if (isSemiPlanarYUV(c->opts.dst_format))
+                c->yuv2nv12cX = ff_yuv2nv12cX_neon;
         }
         switch (c->opts.src_format) {
         case AV_PIX_FMT_ABGR:
-- 
2.36.0.windows.1