Re: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2nv12cX_c()

From: Zhao Zhili <quinkblack-at-foxmail.com@ffmpeg.org>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: Dash Santosh Sathyanarayanan <dash.sathyanarayanan@multicorewareinc.com>
Subject: Re: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2nv12cX_c()
Date: Mon, 26 May 2025 17:34:43 +0800
Message-ID: <tencent_B27C26AD4A3989C195B6AC686E003568FA0A@qq.com> (raw)
In-Reply-To: <MA0P287MB1158352BE49E1A61B05FECB8D665A@MA0P287MB1158.INDP287.PROD.OUTLOOK.COM>

> On May 26, 2025, at 16:40, Harshitha Sarangu Suresh <harshitha@multicorewareinc.com> wrote:
> 
> Hi,
>     Did you get a chance to review this patch?

Thank you for your contribution. However, FFmpeg uses hand-written assembly rather than compiler intrinsics for NEON code.

> 
> Get Outlook for Android<https://aka.ms/AAb9ysg>
> ________________________________
> From: Harshitha Sarangu Suresh
> Sent: Thursday, May 22, 2025 7:24:15 PM
> To: ffmpeg-devel@ffmpeg.org <ffmpeg-devel@ffmpeg.org>
> Cc: Dash Santosh Sathyanarayanan <dash.sathyanarayanan@multicorewareinc.com>
> Subject: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2nv12cX_c()
> 
> This optimization provides a 6x speedup for this function. The speedup was measured by adding timers around both the original C function and the optimized NEON intrinsics function.
> 
> 
> From 1deceb0394a5acdf70677870dc252fd66a91dd9f Mon Sep 17 00:00:00 2001
> From: Harshitha Suresh <harshitha@multicorewareinc.com>
> Date: Mon, 19 May 2025 22:37:20 +0530
> Subject: [PATCH] swscale/output: Implement neon intrinsics for yuv2nv12cX_c()
> 
> ---
> libswscale/aarch64/swscale.c | 151 +++++++++++++++++++++++++++++++++++
> 1 file changed, 151 insertions(+)
> 
> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
> index 6e5a721c1f..fb59c3f1b0 100644
> --- a/libswscale/aarch64/swscale.c
> +++ b/libswscale/aarch64/swscale.c
> @@ -21,6 +21,9 @@
> #include "libswscale/swscale.h"
> #include "libswscale/swscale_internal.h"
> #include "libavutil/aarch64/cpu.h"
> +#if defined (__aarch64__)
> +#include <arm_neon.h>
> +#endif
> 
> void ff_hscale16to15_4_neon_asm(int shift, int16_t *_dst, int dstW,
>                       const uint8_t *_src, const int16_t *filter,
> @@ -142,6 +145,153 @@ static void ff_hscale16to19_X4_neon(SwsInternal *c, int16_t *_dst, int dstW,
> 
> }
> 
> +/*
> + * NEON-intrinsics counterpart of yuv2nv12cX_c(): vertically filters the U/V
> + * planes and writes interleaved 8-bit chroma to dest, 8 samples per vector
> + * iteration plus a scalar tail. Swapped-chroma formats are stored as V,U.
> + */
> +static void ff_yuv2nv12cX_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
> +    const int16_t *chrFilter, int chrFilterSize,
> +    const int16_t **chrUSrc, const int16_t **chrVSrc,
> +    uint8_t *dest, int chrDstW)
> +{
> +    int i;
> +    int u_dither[8], v_dither[8];
> +    // The vector loops advance i by 8, so the (i & 7) dither pattern never changes
> +    for (i = 0; i < 8; i++) {
> +        u_dither[i] = chrDither[i & 7] << 12;
> +        v_dither[i] = chrDither[(i + 3) & 7] << 12;
> +    }
> +    int32x4_t u0 = vld1q_s32(&u_dither[0]);
> +    int32x4_t u1 = vld1q_s32(&u_dither[4]);
> +    int32x4_t v0 = vld1q_s32(&v_dither[0]);
> +    int32x4_t v1 = vld1q_s32(&v_dither[4]);
> +
> +    if (!isSwappedChroma(dstFormat))
> +    {
> +        for (i = 0; i <= chrDstW - 8; i += 8)
> +        {
> +            int32x4_t udst0 = u0;
> +            int32x4_t udst1 = u1;
> +            int32x4_t vdst0 = v0;
> +            int32x4_t vdst1 = v1;
> +
> +            for (int j = 0; j < chrFilterSize; j++)
> +            {
> +                int16x8_t usrc0 = vld1q_s16(&chrUSrc[j][i]);
> +                int16x8_t vsrc0 = vld1q_s16(&chrVSrc[j][i]);
> +                int32x4_t usrc0_low = vmovl_s16(vget_low_s16(usrc0));
> +                int32x4_t usrc0_high = vmovl_s16(vget_high_s16(usrc0));
> +                int32x4_t vsrc0_low = vmovl_s16(vget_low_s16(vsrc0));
> +                int32x4_t vsrc0_high = vmovl_s16(vget_high_s16(vsrc0));
> +
> +                udst0 = vmlaq_n_s32(udst0, usrc0_low, chrFilter[j]);
> +                udst1 = vmlaq_n_s32(udst1, usrc0_high, chrFilter[j]);
> +                vdst0 = vmlaq_n_s32(vdst0, vsrc0_low, chrFilter[j]);
> +                vdst1 = vmlaq_n_s32(vdst1, vsrc0_high, chrFilter[j]);
> +            }
> +            // Arithmetic right shift by 19, matching the scalar path (u >> 19)
> +            udst0 = vshrq_n_s32(udst0, 19);
> +            udst1 = vshrq_n_s32(udst1, 19);
> +            vdst0 = vshrq_n_s32(vdst0, 19);
> +            vdst1 = vshrq_n_s32(vdst1, 19);
> +
> +            // Convert to 16-bit and then to uint8, with saturation
> +            int16x8_t u16 = vcombine_s16(vqmovn_s32(udst0), vqmovn_s32(udst1));
> +            int16x8_t v16 = vcombine_s16(vqmovn_s32(vdst0), vqmovn_s32(vdst1));
> +
> +            uint8x8_t u8 = vqmovun_s16(u16);
> +            uint8x8_t v8 = vqmovun_s16(v16);
> +
> +            // Store interleaved u/v as UV UV UV...
> +            uint8x8x2_t uv;
> +            uv.val[0] = u8;
> +            uv.val[1] = v8;
> +            vst2_u8(dest + 2 * i, uv);
> +        }
> +
> +        // Handle remaining pixels with scalar fallback
> +        for (; i < chrDstW; i++)
> +        {
> +            int u = chrDither[i & 7] << 12;
> +            int v = chrDither[(i + 3) & 7] << 12;
> +
> +            for (int j = 0; j < chrFilterSize; j++)
> +            {
> +                u += chrUSrc[j][i] * chrFilter[j];
> +                v += chrVSrc[j][i] * chrFilter[j];
> +            }
> +
> +            uint8_t uu = av_clip_uint8(u >> 19);
> +            uint8_t vv = av_clip_uint8(v >> 19);
> +            dest[2 * i] = uu;
> +            dest[2 * i + 1] = vv;
> +        }
> +    }
> +    else
> +    {
> +        // Swapped chroma: same filtering as above, but V is stored before U
> +        for (i = 0; i <= chrDstW - 8; i += 8)
> +        {
> +            int32x4_t udst0 = u0;
> +            int32x4_t udst1 = u1;
> +            int32x4_t vdst0 = v0;
> +            int32x4_t vdst1 = v1;
> +
> +            for (int j = 0; j < chrFilterSize; j++)
> +            {
> +                int16x8_t usrc = vld1q_s16(&chrUSrc[j][i]);
> +                int16x8_t vsrc = vld1q_s16(&chrVSrc[j][i]);
> +                int32x4_t usrc_low = vmovl_s16(vget_low_s16(usrc));
> +                int32x4_t usrc_high = vmovl_s16(vget_high_s16(usrc));
> +                int32x4_t vsrc_low = vmovl_s16(vget_low_s16(vsrc));
> +                int32x4_t vsrc_high = vmovl_s16(vget_high_s16(vsrc));
> +
> +                udst0 = vmlaq_n_s32(udst0, usrc_low, chrFilter[j]);
> +                udst1 = vmlaq_n_s32(udst1, usrc_high, chrFilter[j]);
> +                vdst0 = vmlaq_n_s32(vdst0, vsrc_low, chrFilter[j]);
> +                vdst1 = vmlaq_n_s32(vdst1, vsrc_high, chrFilter[j]);
> +            }
> +            // Arithmetic right shift by 19, matching the scalar path (u >> 19)
> +            udst0 = vshrq_n_s32(udst0, 19);
> +            udst1 = vshrq_n_s32(udst1, 19);
> +            vdst0 = vshrq_n_s32(vdst0, 19);
> +            vdst1 = vshrq_n_s32(vdst1, 19);
> +
> +            // Convert to 16-bit and then to uint8, with saturation
> +            int16x8_t u16 = vcombine_s16(vqmovn_s32(udst0), vqmovn_s32(udst1));
> +            int16x8_t v16 = vcombine_s16(vqmovn_s32(vdst0), vqmovn_s32(vdst1));
> +
> +            uint8x8_t u8 = vqmovun_s16(u16);
> +            uint8x8_t v8 = vqmovun_s16(v16);
> +
> +            // Store interleaved v/u as VU VU VU...
> +            uint8x8x2_t uv;
> +            uv.val[0] = v8;
> +            uv.val[1] = u8;
> +            vst2_u8(dest + 2 * i, uv);
> +        }
> +
> +        // Handle remaining pixels with scalar fallback
> +        for (; i < chrDstW; i++)
> +        {
> +            int u = chrDither[i & 7] << 12;
> +            int v = chrDither[(i + 3) & 7] << 12;
> +
> +            for (int j = 0; j < chrFilterSize; j++)
> +            {
> +                u += chrUSrc[j][i] * chrFilter[j];
> +                v += chrVSrc[j][i] * chrFilter[j];
> +            }
> +
> +            uint8_t uu = av_clip_uint8(u >> 19);
> +            uint8_t vv = av_clip_uint8(v >> 19);
> +            dest[2 * i] = vv;
> +            dest[2 * i + 1] = uu;
> +        }
> +    }
> +}
> +
> #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
> void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
>                                                 SwsInternal *c, int16_t *data, \
> @@ -275,6 +425,7 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
>         ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
>         if (c->dstBpc == 8) {
>             c->yuv2planeX = ff_yuv2planeX_8_neon;
> +            c->yuv2nv12cX = ff_yuv2nv12cX_neon;
>         }
>         switch (c->opts.src_format) {
>         case AV_PIX_FMT_ABGR:
> --
> 2.36.0.windows.1
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".