From: Harshitha Sarangu Suresh <harshitha@multicorewareinc.com> To: "ffmpeg-devel@ffmpeg.org" <ffmpeg-devel@ffmpeg.org> Cc: Dash Santosh Sathyanarayanan <dash.sathyanarayanan@multicorewareinc.com> Subject: Re: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2planeX_10_c_template() Date: Mon, 26 May 2025 08:40:17 +0000 Message-ID: <MA0P287MB115807263CED206B88F0B472D665A@MA0P287MB1158.INDP287.PROD.OUTLOOK.COM> (raw) In-Reply-To: <MA0P287MB1158903867B74EADCF10A9C2D699A@MA0P287MB1158.INDP287.PROD.OUTLOOK.COM> Hi, Did you get a chance to review this patch? Get Outlook for Android <https://aka.ms/AAb9ysg> ________________________________ From: Harshitha Sarangu Suresh Sent: Thursday, May 22, 2025 7:27:31 PM To: ffmpeg-devel@ffmpeg.org <ffmpeg-devel@ffmpeg.org> Cc: Dash Santosh Sathyanarayanan <dash.sathyanarayanan@multicorewareinc.com> Subject: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2planeX_10_c_template() This optimization provides a 5x improvement for the module. The performance gain was measured by adding C timers inside both the C function and the optimized NEON intrinsics function. 
From 904144c2db9e5e72d56360c4c2eb38d426852901 Mon Sep 17 00:00:00 2001 From: Harshitha Suresh <harshitha@multicorewareinc.com> Date: Thu, 22 May 2025 10:23:55 +0530 Subject: [PATCH] swscale/output: Implement neon intrinsics for yuv2planeX_10_c_template() --- libswscale/output.c | 76 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/libswscale/output.c b/libswscale/output.c index c37649e7ce..345df5ce59 100644 --- a/libswscale/output.c +++ b/libswscale/output.c @@ -22,7 +22,9 @@ #include <stddef.h> #include <stdint.h> #include <string.h> - +#if defined (__aarch64__) +#include <arm_neon.h> +#endif #include "libavutil/attributes.h" #include "libavutil/avutil.h" #include "libavutil/avassert.h" @@ -337,6 +339,77 @@ yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW, } } + +#if defined (__aarch64__) && !defined(__APPLE__) +static av_always_inline void +yuv2planeX_10_c_template(const int16_t *filter, int filterSize, + const int16_t **src, uint16_t *dest, int dstW, + int big_endian, int output_bits) +{ + const int shift = 11 + 16 - output_bits; + const int bias = 1 << (shift - 1); + const int clip_max = (1 << output_bits) - 1; + int i; + + for (i = 0; i < dstW; i += 16) { + int32x4_t sum0_lo = vdupq_n_s32(bias); + int32x4_t sum0_hi = vdupq_n_s32(bias); + int32x4_t sum1_lo = vdupq_n_s32(bias); + int32x4_t sum1_hi = vdupq_n_s32(bias); + + for (int j = 0; j < filterSize; j++) { + int16x8_t src_vec0 = vld1q_s16(&src[j][i]); + int16x8_t src_vec1 = vld1q_s16(&src[j][i + 8]); + int16x8_t filter_val = vdupq_n_s16(filter[j]); + + sum0_lo = vmlal_s16(sum0_lo, vget_low_s16(src_vec0), vget_low_s16(filter_val)); + sum0_hi = vmlal_s16(sum0_hi, vget_high_s16(src_vec0), vget_high_s16(filter_val)); + sum1_lo = vmlal_s16(sum1_lo, vget_low_s16(src_vec1), vget_low_s16(filter_val)); + sum1_hi = vmlal_s16(sum1_hi, vget_high_s16(src_vec1), vget_high_s16(filter_val)); + } + + // Right shift with rounding + int32x4_t 
shift_vec = vdupq_n_s32(-shift); + sum0_lo = vshlq_s32(sum0_lo, shift_vec); + sum0_hi = vshlq_s32(sum0_hi, shift_vec); + sum1_lo = vshlq_s32(sum1_lo, shift_vec); + sum1_hi = vshlq_s32(sum1_hi, shift_vec); + + // Clip to output_bits range + sum0_lo = vmaxq_s32(vminq_s32(sum0_lo, vdupq_n_s32(clip_max)), vdupq_n_s32(0)); + sum0_hi = vmaxq_s32(vminq_s32(sum0_hi, vdupq_n_s32(clip_max)), vdupq_n_s32(0)); + sum1_lo = vmaxq_s32(vminq_s32(sum1_lo, vdupq_n_s32(clip_max)), vdupq_n_s32(0)); + sum1_hi = vmaxq_s32(vminq_s32(sum1_hi, vdupq_n_s32(clip_max)), vdupq_n_s32(0)); + + // Convert to 16-bit + uint16x8_t result0 = vcombine_u16( + vreinterpret_u16_s16(vmovn_s32(sum0_lo)), + vreinterpret_u16_s16(vmovn_s32(sum0_hi)) + ); + uint16x8_t result1 = vcombine_u16( + vreinterpret_u16_s16(vmovn_s32(sum1_lo)), + vreinterpret_u16_s16(vmovn_s32(sum1_hi)) + ); + + // Store with proper endianness + if (big_endian) { + result0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result0))); + result1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result1))); + } + vst1q_u16(&dest[i], result0); + vst1q_u16(&dest[i + 8], result1); + } + + // Handle remaining pixels + for (; i < dstW; i++) { + int val = bias; + for (int j = 0; j < filterSize; j++) { + val += src[j][i] * filter[j]; + } + output_pixel(&dest[i], val); + } +} +#else static av_always_inline void yuv2planeX_10_c_template(const int16_t *filter, int filterSize, const int16_t **src, uint16_t *dest, int dstW, @@ -355,6 +428,7 @@ yuv2planeX_10_c_template(const int16_t *filter, int filterSize, output_pixel(&dest[i], val); } } +#endif #undef output_pixel -- 2.36.0.windows.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-05-26 8:40 UTC|newest] Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-05-22 13:57 Harshitha Sarangu Suresh 2025-05-26 8:40 ` Harshitha Sarangu Suresh [this message] 2025-05-26 9:05 ` Martin Storsjö
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=MA0P287MB115807263CED206B88F0B472D665A@MA0P287MB1158.INDP287.PROD.OUTLOOK.COM \ --to=harshitha@multicorewareinc.com \ --cc=dash.sathyanarayanan@multicorewareinc.com \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git