From: Harshitha Sarangu Suresh <harshitha@multicorewareinc.com> To: "ffmpeg-devel@ffmpeg.org" <ffmpeg-devel@ffmpeg.org> Cc: Dash Santosh Sathyanarayanan <dash.sathyanarayanan@multicorewareinc.com> Subject: Re: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2nv12cX_c() Date: Mon, 26 May 2025 08:40:48 +0000 Message-ID: <MA0P287MB1158352BE49E1A61B05FECB8D665A@MA0P287MB1158.INDP287.PROD.OUTLOOK.COM> (raw) In-Reply-To: <MA0P287MB1158754434858E1A7DFA1858D699A@MA0P287MB1158.INDP287.PROD.OUTLOOK.COM> Hi, Did you get a chance to review this patch? Get Outlook for Android<https://aka.ms/AAb9ysg> ________________________________ From: Harshitha Sarangu Suresh Sent: Thursday, May 22, 2025 7:24:15 PM To: ffmpeg-devel@ffmpeg.org <ffmpeg-devel@ffmpeg.org> Cc: Dash Santosh Sathyanarayanan <dash.sathyanarayanan@multicorewareinc.com> Subject: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2nv12cX_c() This optimization provides a 6x improvement for the module. The boost in performance was calculated by adding C timers inside the C function and the optimized neon intrinsic function. 
From 1deceb0394a5acdf70677870dc252fd66a91dd9f Mon Sep 17 00:00:00 2001
From: Harshitha Suresh <harshitha@multicorewareinc.com>
Date: Mon, 19 May 2025 22:37:20 +0530
Subject: [PATCH] swscale/output: Implement neon intrinsics for yuv2nv12cX_c()

---
 libswscale/aarch64/swscale.c | 106 +++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..fb59c3f1b0 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -21,6 +21,9 @@
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 #include "libavutil/aarch64/cpu.h"
+#if defined (__aarch64__)
+#include <arm_neon.h>
+#endif
 
 void ff_hscale16to15_4_neon_asm(int shift, int16_t *_dst, int dstW,
                                 const uint8_t *_src, const int16_t *filter,
@@ -142,6 +145,108 @@ static void ff_hscale16to19_X4_neon(SwsInternal *c, int16_t *_dst, int dstW,
 
 }
 
+/**
+ * Vertically filter chroma into one interleaved 8-bit row using NEON intrinsics.
+ *
+ * For each output position i the U and V accumulators start at
+ * chrDither[i & 7] << 12 and chrDither[(i + 3) & 7] << 12, gain
+ * chrUSrc[j][i] * chrFilter[j] (resp. chrVSrc) for every tap j, and are then
+ * shifted right by 19 and clipped to 0..255.
+ *
+ * @param dstFormat     destination format; when isSwappedChroma() is true the
+ *                      pair is stored as V,U, otherwise as U,V
+ * @param chrDither     dither bytes, read at indices 0..7
+ * @param chrFilter     chrFilterSize filter coefficients
+ * @param chrFilterSize number of taps, i.e. of source rows
+ * @param chrUSrc       chrFilterSize pointers to U input rows
+ * @param chrVSrc       chrFilterSize pointers to V input rows
+ * @param dest          output row; 2 * chrDstW bytes are written
+ * @param chrDstW       number of chroma pairs to produce
+ */
+static void ff_yuv2nv12cX_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+                               const int16_t *chrFilter, int chrFilterSize,
+                               const int16_t **chrUSrc, const int16_t **chrVSrc,
+                               uint8_t *dest, int chrDstW)
+{
+    /* The two layouts differ only in byte order, so the choice is made once
+     * and a single code path serves both. */
+    const int swapped = isSwappedChroma(dstFormat);
+    int u_dither[8], v_dither[8];
+    int i;
+
+    /* The vector loop advances 8 positions at a time, so each lane always
+     * uses the same dither entry and the pattern can be built up front. */
+    for (i = 0; i < 8; i++) {
+        u_dither[i] = chrDither[i & 7] << 12;
+        v_dither[i] = chrDither[(i + 3) & 7] << 12;
+    }
+    const int32x4_t u0 = vld1q_s32(&u_dither[0]);
+    const int32x4_t u1 = vld1q_s32(&u_dither[4]);
+    const int32x4_t v0 = vld1q_s32(&v_dither[0]);
+    const int32x4_t v1 = vld1q_s32(&v_dither[4]);
+
+    for (i = 0; i <= chrDstW - 8; i += 8) {
+        int32x4_t udst0 = u0;
+        int32x4_t udst1 = u1;
+        int32x4_t vdst0 = v0;
+        int32x4_t vdst1 = v1;
+
+        for (int j = 0; j < chrFilterSize; j++) {
+            const int16x8_t usrc = vld1q_s16(&chrUSrc[j][i]);
+            const int16x8_t vsrc = vld1q_s16(&chrVSrc[j][i]);
+
+            /* Widen the 16-bit samples so the products accumulate in 32 bits. */
+            const int32x4_t usrc_low  = vmovl_s16(vget_low_s16(usrc));
+            const int32x4_t usrc_high = vmovl_s16(vget_high_s16(usrc));
+            const int32x4_t vsrc_low  = vmovl_s16(vget_low_s16(vsrc));
+            const int32x4_t vsrc_high = vmovl_s16(vget_high_s16(vsrc));
+
+            udst0 = vmlaq_n_s32(udst0, usrc_low,  chrFilter[j]);
+            udst1 = vmlaq_n_s32(udst1, usrc_high, chrFilter[j]);
+            vdst0 = vmlaq_n_s32(vdst0, vsrc_low,  chrFilter[j]);
+            vdst1 = vmlaq_n_s32(vdst1, vsrc_high, chrFilter[j]);
+        }
+
+        udst0 = vshrq_n_s32(udst0, 19);
+        udst1 = vshrq_n_s32(udst1, 19);
+        vdst0 = vshrq_n_s32(vdst0, 19);
+        vdst1 = vshrq_n_s32(vdst1, 19);
+
+        /* Saturating narrowing: 32 -> 16 bits signed, then 16 -> 8 bits unsigned. */
+        const int16x8_t u16 = vcombine_s16(vqmovn_s32(udst0), vqmovn_s32(udst1));
+        const int16x8_t v16 = vcombine_s16(vqmovn_s32(vdst0), vqmovn_s32(vdst1));
+        const uint8x8_t u8  = vqmovun_s16(u16);
+        const uint8x8_t v8  = vqmovun_s16(v16);
+
+        /* vst2 interleaves val[0] and val[1] element by element. */
+        uint8x8x2_t uv;
+        if (swapped) {
+            uv.val[0] = v8;
+            uv.val[1] = u8;
+        } else {
+            uv.val[0] = u8;
+            uv.val[1] = v8;
+        }
+        vst2_u8(dest + 2 * i, uv);
+    }
+
+    /* Scalar loop for the positions the 8-wide loop did not cover. */
+    for (; i < chrDstW; i++) {
+        int u = chrDither[i & 7] << 12;
+        int v = chrDither[(i + 3) & 7] << 12;
+
+        for (int j = 0; j < chrFilterSize; j++) {
+            u += chrUSrc[j][i] * chrFilter[j];
+            v += chrVSrc[j][i] * chrFilter[j];
+        }
+
+        const uint8_t uu = av_clip_uint8(u >> 19);
+        const uint8_t vv = av_clip_uint8(v >> 19);
+        dest[2 * i]     = swapped ? vv : uu;
+        dest[2 * i + 1] = swapped ? uu : vv;
+    }
+}
+
 #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
 void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
                                 SwsInternal *c, int16_t *data, \
@@ -275,6 +380,7 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
+            c->yuv2nv12cX = ff_yuv2nv12cX_neon;
         }
         switch (c->opts.src_format) {
         case AV_PIX_FMT_ABGR:
-- 
2.36.0.windows.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-05-26 8:40 UTC|newest] Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-05-22 13:54 Harshitha Sarangu Suresh 2025-05-26 8:40 ` Harshitha Sarangu Suresh [this message] 2025-05-26 9:34 ` Zhao Zhili 2025-06-02 4:30 ` Harshitha Sarangu Suresh
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=MA0P287MB1158352BE49E1A61B05FECB8D665A@MA0P287MB1158.INDP287.PROD.OUTLOOK.COM \ --to=harshitha@multicorewareinc.com \ --cc=dash.sathyanarayanan@multicorewareinc.com \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git