This optimization provides roughly a 5x speedup for this module. The speedup was measured by adding C timers inside both the plain C function and the optimized NEON-intrinsics function.

From 904144c2db9e5e72d56360c4c2eb38d426852901 Mon Sep 17 00:00:00 2001
From: Harshitha Suresh
Date: Thu, 22 May 2025 10:23:55 +0530
Subject: [PATCH] swscale/output: Implement neon intrinsics for
 yuv2planeX_10_c_template()

---
 libswscale/output.c | 76 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/libswscale/output.c b/libswscale/output.c
index c37649e7ce..345df5ce59 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -22,7 +22,9 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
-
+#if defined (__aarch64__)
+#include <arm_neon.h>
+#endif
 #include "libavutil/attributes.h"
 #include "libavutil/avutil.h"
 #include "libavutil/avassert.h"
@@ -337,6 +339,77 @@ yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
     }
 }
 
+
+#if defined (__aarch64__) && !defined(__APPLE__)
+static av_always_inline void
+yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
+                         const int16_t **src, uint16_t *dest, int dstW,
+                         int big_endian, int output_bits)
+{
+    const int shift = 11 + 16 - output_bits;
+    const int bias = 1 << (shift - 1);
+    const int clip_max = (1 << output_bits) - 1;
+    int i;
+
+    for (i = 0; i + 16 <= dstW; i += 16) {
+        int32x4_t sum0_lo = vdupq_n_s32(bias);
+        int32x4_t sum0_hi = vdupq_n_s32(bias);
+        int32x4_t sum1_lo = vdupq_n_s32(bias);
+        int32x4_t sum1_hi = vdupq_n_s32(bias);
+
+        for (int j = 0; j < filterSize; j++) {
+            int16x8_t src_vec0 = vld1q_s16(&src[j][i]);
+            int16x8_t src_vec1 = vld1q_s16(&src[j][i + 8]);
+            int16x8_t filter_val = vdupq_n_s16(filter[j]);
+
+            sum0_lo = vmlal_s16(sum0_lo, vget_low_s16(src_vec0), vget_low_s16(filter_val));
+            sum0_hi = vmlal_s16(sum0_hi, vget_high_s16(src_vec0), vget_high_s16(filter_val));
+            sum1_lo = vmlal_s16(sum1_lo, vget_low_s16(src_vec1), vget_low_s16(filter_val));
+            sum1_hi = vmlal_s16(sum1_hi, vget_high_s16(src_vec1), vget_high_s16(filter_val));
+        }
+
+        // Shift right; rounding is provided by the bias added above
+        int32x4_t shift_vec = vdupq_n_s32(-shift);
+        sum0_lo = vshlq_s32(sum0_lo, shift_vec);
+        sum0_hi = vshlq_s32(sum0_hi, shift_vec);
+        sum1_lo = vshlq_s32(sum1_lo, shift_vec);
+        sum1_hi = vshlq_s32(sum1_hi, shift_vec);
+
+        // Clip to output_bits range
+        sum0_lo = vmaxq_s32(vminq_s32(sum0_lo, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+        sum0_hi = vmaxq_s32(vminq_s32(sum0_hi, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+        sum1_lo = vmaxq_s32(vminq_s32(sum1_lo, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+        sum1_hi = vmaxq_s32(vminq_s32(sum1_hi, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+
+        // Convert to 16-bit
+        uint16x8_t result0 = vcombine_u16(
+            vreinterpret_u16_s16(vmovn_s32(sum0_lo)),
+            vreinterpret_u16_s16(vmovn_s32(sum0_hi))
+        );
+        uint16x8_t result1 = vcombine_u16(
+            vreinterpret_u16_s16(vmovn_s32(sum1_lo)),
+            vreinterpret_u16_s16(vmovn_s32(sum1_hi))
+        );
+
+        // Store with proper endianness
+        if (big_endian) {
+            result0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result0)));
+            result1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result1)));
+        }
+        vst1q_u16(&dest[i], result0);
+        vst1q_u16(&dest[i + 8], result1);
+    }
+
+    // Handle the remaining (dstW % 16) pixels with the scalar path
+    for (; i < dstW; i++) {
+        int val = bias;
+        for (int j = 0; j < filterSize; j++) {
+            val += src[j][i] * filter[j];
+        }
+        output_pixel(&dest[i], val);
+    }
+}
+#else
 static av_always_inline void
 yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
                          const int16_t **src, uint16_t *dest, int dstW,
@@ -355,6 +428,7 @@ yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
         output_pixel(&dest[i], val);
     }
 }
+#endif
 
 #undef output_pixel

-- 
2.36.0.windows.1
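
P.S. For reviewers interested in how the 5x figure was obtained: below is a minimal, self-contained sketch of one way the timing could be done, built around a standalone copy of the scalar 10-bit loop. The buffer size, filter width, coefficient values and iteration count are hypothetical and not part of the patch; the NEON version would be timed the same way by swapping in the intrinsics function.

/*
 * Hypothetical benchmark sketch, not part of the patch. Times a standalone
 * copy of the scalar 10-bit yuv2planeX loop; the NEON variant from the
 * patch would be timed the same way for comparison.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define DSTW       1920   /* assumed line width  */
#define FILTERSIZE 8      /* assumed filter taps */
#define OUTBITS    10     /* 10-bit output       */

/* Standalone copy of the scalar path (shift + clip, little-endian store). */
static void yuv2planeX_10_scalar(const int16_t *filter, int filterSize,
                                 const int16_t **src, uint16_t *dest, int dstW)
{
    const int shift    = 11 + 16 - OUTBITS;
    const int bias     = 1 << (shift - 1);
    const int clip_max = (1 << OUTBITS) - 1;

    for (int i = 0; i < dstW; i++) {
        int val = bias;
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];
        val >>= shift;
        if (val < 0)
            val = 0;
        if (val > clip_max)
            val = clip_max;
        dest[i] = (uint16_t)val;
    }
}

static int64_t now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

int main(void)
{
    static int16_t  src_data[FILTERSIZE][DSTW];
    static uint16_t dest[DSTW];
    const int16_t  *src[FILTERSIZE];
    int16_t         filter[FILTERSIZE];
    const int       iters = 10000;   /* assumed iteration count */

    for (int j = 0; j < FILTERSIZE; j++) {
        src[j]    = src_data[j];
        filter[j] = 512;             /* arbitrary coefficients */
        for (int i = 0; i < DSTW; i++)
            src_data[j][i] = (int16_t)(rand() & 0x3FFF);
    }

    int64_t t0 = now_ns();
    for (int k = 0; k < iters; k++)
        yuv2planeX_10_scalar(filter, FILTERSIZE, src, dest, DSTW);
    int64_t t1 = now_ns();

    /* dest[0] is printed so the computation is not optimized away. */
    printf("scalar: %.1f ns/call (dest[0] = %u)\n",
           (double)(t1 - t0) / iters, dest[0]);
    return 0;
}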