This optimization provides roughly a 5x speedup for this module. The speedup was measured by adding C timers inside both the plain C function and the optimized NEON-intrinsics function.

From 904144c2db9e5e72d56360c4c2eb38d426852901 Mon Sep 17 00:00:00 2001
From: Harshitha Suresh
Date: Thu, 22 May 2025 10:23:55 +0530
Subject: [PATCH] swscale/output: Implement neon intrinsics for
 yuv2planeX_10_c_template()

---
 libswscale/output.c | 76 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/libswscale/output.c b/libswscale/output.c
index c37649e7ce..345df5ce59 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -22,7 +22,9 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
-
+#if defined (__aarch64__)
+#include <arm_neon.h>
+#endif
 #include "libavutil/attributes.h"
 #include "libavutil/avutil.h"
 #include "libavutil/avassert.h"
@@ -337,6 +339,77 @@ yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
     }
 }
 
+
+#if defined (__aarch64__) && !defined(__APPLE__)
+static av_always_inline void
+yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
+                         const int16_t **src, uint16_t *dest, int dstW,
+                         int big_endian, int output_bits)
+{
+    const int shift = 11 + 16 - output_bits;
+    const int bias = 1 << (shift - 1);
+    const int clip_max = (1 << output_bits) - 1;
+    int i;
+
+    for (i = 0; i + 16 <= dstW; i += 16) {
+        int32x4_t sum0_lo = vdupq_n_s32(bias);
+        int32x4_t sum0_hi = vdupq_n_s32(bias);
+        int32x4_t sum1_lo = vdupq_n_s32(bias);
+        int32x4_t sum1_hi = vdupq_n_s32(bias);
+
+        for (int j = 0; j < filterSize; j++) {
+            int16x8_t src_vec0 = vld1q_s16(&src[j][i]);
+            int16x8_t src_vec1 = vld1q_s16(&src[j][i + 8]);
+            int16x8_t filter_val = vdupq_n_s16(filter[j]);
+
+            sum0_lo = vmlal_s16(sum0_lo, vget_low_s16(src_vec0), vget_low_s16(filter_val));
+            sum0_hi = vmlal_s16(sum0_hi, vget_high_s16(src_vec0), vget_high_s16(filter_val));
+            sum1_lo = vmlal_s16(sum1_lo, vget_low_s16(src_vec1), vget_low_s16(filter_val));
+            sum1_hi = vmlal_s16(sum1_hi, vget_high_s16(src_vec1), vget_high_s16(filter_val));
+        }
+
+        // Shift right; rounding is provided by the bias added above
+        int32x4_t shift_vec = vdupq_n_s32(-shift);
+        sum0_lo = vshlq_s32(sum0_lo, shift_vec);
+        sum0_hi = vshlq_s32(sum0_hi, shift_vec);
+        sum1_lo = vshlq_s32(sum1_lo, shift_vec);
+        sum1_hi = vshlq_s32(sum1_hi, shift_vec);
+
+        // Clip to output_bits range
+        sum0_lo = vmaxq_s32(vminq_s32(sum0_lo, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+        sum0_hi = vmaxq_s32(vminq_s32(sum0_hi, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+        sum1_lo = vmaxq_s32(vminq_s32(sum1_lo, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+        sum1_hi = vmaxq_s32(vminq_s32(sum1_hi, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+
+        // Convert to 16-bit
+        uint16x8_t result0 = vcombine_u16(
+            vreinterpret_u16_s16(vmovn_s32(sum0_lo)),
+            vreinterpret_u16_s16(vmovn_s32(sum0_hi))
+        );
+        uint16x8_t result1 = vcombine_u16(
+            vreinterpret_u16_s16(vmovn_s32(sum1_lo)),
+            vreinterpret_u16_s16(vmovn_s32(sum1_hi))
+        );
+
+        // Store with proper endianness
+        if (big_endian) {
+            result0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result0)));
+            result1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result1)));
+        }
+        vst1q_u16(&dest[i], result0);
+        vst1q_u16(&dest[i + 8], result1);
+    }
+
+    // Handle the remaining (dstW % 16) pixels with the scalar path
+    for (; i < dstW; i++) {
+        int val = bias;
+        for (int j = 0; j < filterSize; j++) {
+            val += src[j][i] * filter[j];
+        }
+        output_pixel(&dest[i], val);
+    }
+}
+#else
 static av_always_inline void
 yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
                          const int16_t **src, uint16_t *dest, int dstW,
@@ -355,6 +428,7 @@ yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
         output_pixel(&dest[i], val);
     }
 }
+#endif
 
 #undef output_pixel

-- 
2.36.0.windows.1
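
P.S. For reviewers interested in how the 5x figure was obtained: below is a minimal, self-contained sketch of one way the timing could be done, built around a standalone copy of the scalar 10-bit loop. The buffer size, filter width, coefficient values and iteration count are hypothetical and not part of the patch; the NEON version would be timed the same way by swapping in the intrinsics function.

/*
 * Hypothetical benchmark sketch, not part of the patch. Times a standalone
 * copy of the scalar 10-bit yuv2planeX loop; the NEON variant from the
 * patch would be timed the same way for comparison.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define DSTW       1920   /* assumed line width  */
#define FILTERSIZE 8      /* assumed filter taps */
#define OUTBITS    10     /* 10-bit output       */

/* Standalone copy of the scalar path (shift + clip, little-endian store). */
static void yuv2planeX_10_scalar(const int16_t *filter, int filterSize,
                                 const int16_t **src, uint16_t *dest, int dstW)
{
    const int shift    = 11 + 16 - OUTBITS;
    const int bias     = 1 << (shift - 1);
    const int clip_max = (1 << OUTBITS) - 1;

    for (int i = 0; i < dstW; i++) {
        int val = bias;
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];
        val >>= shift;
        if (val < 0)
            val = 0;
        if (val > clip_max)
            val = clip_max;
        dest[i] = (uint16_t)val;
    }
}

static int64_t now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

int main(void)
{
    static int16_t  src_data[FILTERSIZE][DSTW];
    static uint16_t dest[DSTW];
    const int16_t  *src[FILTERSIZE];
    int16_t         filter[FILTERSIZE];
    const int       iters = 10000;   /* assumed iteration count */

    for (int j = 0; j < FILTERSIZE; j++) {
        src[j]    = src_data[j];
        filter[j] = 512;             /* arbitrary coefficients */
        for (int i = 0; i < DSTW; i++)
            src_data[j][i] = (int16_t)(rand() & 0x3FFF);
    }

    int64_t t0 = now_ns();
    for (int k = 0; k < iters; k++)
        yuv2planeX_10_scalar(filter, FILTERSIZE, src, dest, DSTW);
    int64_t t1 = now_ns();

    /* dest[0] is printed so the computation is not optimized away. */
    printf("scalar: %.1f ns/call (dest[0] = %u)\n",
           (double)(t1 - t0) / iters, dest[0]);
    return 0;
}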