Re: [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c() [v2]

From: "Martin Storsjö" <martin@martin.st>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: Dash Santosh Sathyanarayanan
	<dash.sathyanarayanan@multicorewareinc.com>,
	Logaprakash Ramajayam
	<logaprakash.ramajayam@multicorewareinc.com>
Subject: Re: [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c() [v2]
Date: Thu, 19 Jun 2025 00:01:32 +0300 (EEST)
Message-ID: <747ed620-2660-706e-a834-6c9b468c442@martin.st> (raw)
In-Reply-To: <MA0P287MB1158BA521247AE411CFF896DD66BA@MA0P287MB1158.INDP287.PROD.OUTLOOK.COM>

On Mon, 9 Jun 2025, Harshitha Sarangu Suresh wrote:

> From 4ca5eae1e7164f78296719f19aef97239e5b046a Mon Sep 17 00:00:00 2001
> From: Harshitha Suresh <harshitha@multicorewareinc.com>
> Date: Mon, 19 May 2025 22:37:20 +0530
> Subject: [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c().
>
> yuv2nv12cX_2_512_accurate_c:                          3508.8 ( 1.00x)
> yuv2nv12cX_2_512_accurate_neon:                        369.2 ( 9.50x)
> yuv2nv12cX_2_512_approximate_c:                       3499.0 ( 1.00x)
> yuv2nv12cX_2_512_approximate_neon:                     370.2 ( 9.45x)
> yuv2nv12cX_4_512_accurate_c:                          4683.0 ( 1.00x)
> yuv2nv12cX_4_512_accurate_neon:                        568.8 ( 8.23x)
> yuv2nv12cX_4_512_approximate_c:                       4682.6 ( 1.00x)
> yuv2nv12cX_4_512_approximate_neon:                     569.9 ( 8.22x)
> yuv2nv12cX_8_512_accurate_c:                          7243.0 ( 1.00x)
> yuv2nv12cX_8_512_accurate_neon:                        937.6 ( 7.72x)
> yuv2nv12cX_8_512_approximate_c:                       7235.9 ( 1.00x)
> yuv2nv12cX_8_512_approximate_neon:                     938.3 ( 7.71x)
> yuv2nv12cX_16_512_accurate_c:                        13749.7 ( 1.00x)
> yuv2nv12cX_16_512_accurate_neon:                      1708.1 ( 8.05x)
> yuv2nv12cX_16_512_approximate_c:                     13750.0 ( 1.00x)
> yuv2nv12cX_16_512_approximate_neon:                   1708.6 ( 8.05x)
> ---
> libswscale/aarch64/output.S  | 306 +++++++++++++++++++++++++++++++++++
> libswscale/aarch64/swscale.c |  19 +++
> 2 files changed, 325 insertions(+)
>
> diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
> index 190c438870..2d87cc6a5e 100644
> --- a/libswscale/aarch64/output.S
> +++ b/libswscale/aarch64/output.S
> @@ -226,3 +226,309 @@ function ff_yuv2plane1_8_neon, export=1
>         b.gt            2b                              // loop until width consumed
>         ret
> endfunc
> +
> +function ff_yuv2nv12cX_notswapped_neon, export=1
> +// x0 - dstFormat (unused)
> +// x1 - uint8_t *chrDither
> +// x2 - int16_t *chrFilter
> +// x3 - int chrFilterSize
> +// x4 - int16_t **chrUSrc
> +// x5 - int16_t **chrVSrc
> +// x6 - uint8_t *dest
> +// x7 - int chrDstW
> +
> +        // Load dither pattern and compute U and V dither vectors
> +        ld1             {v0.8b}, [x1]                   // chrDither[0..7]
> +        ext             v1.8b, v0.8b, v0.8b, #3         // Rotate for V: (i+3)&7
> +
> +        uxtl            v0.8h, v0.8b
> +        uxtl            v1.8h, v1.8b
> +
> +        ushll           v2.4s, v0.4h, #12               // U dither low
> +        ushll2          v3.4s, v0.8h, #12               // U dither high
> +        ushll           v4.4s, v1.4h, #12               // V dither low
> +        ushll2          v5.4s, v1.8h, #12               // V dither high
> +
> +        // Check if we can process 16 pixels at a time
> +        tst             w7, #15                         // Check if chrDstW % 16 == 0
> +        b.ne            .Lprocess_8_pixels              // If not, use 8-pixel version

So, if the width isn't a multiple of 16, we'll end up writing 8 pixels at 
a time? Does this mean that we assume that the width always is a multiple 
of 8? If not, we'll end up overwriting the end of each line by 0-7 pixels 
- is that right?

It seems a little peculiar to me that we'd have a 8 pixel alignment 
guarantee but not a 16 pixel alignment here - either we'd probably have 
both or neither at all? If we can't assume alignment, we'd need to have a 
scalar loop as well.

This is not a rhetorical question - I don't know what the 
requirements/guarantees of this function are (and finding out can, worst 
case, be a fair bit of work), so you'd need to find that out and point to 
references. Long time preexisting SIMD implementations of the function 
doing it one particular way may also be a reasonable reference.

Secondly - if the width isn't a multiple of 16, but is a large number, it 
seems wasteful to iterate over it 8 pixels at a time? Wouldn't it be more 
efficient to handle as many pixels as possible, 16 pixels at a time, and 
then finally handle the remainder in whichever form is most suitable.

> +        // =============================================
> +        // 16-pixel processing path
> +        // =============================================

FWIW, this banner feels a bit large compared with the style of the rest of 
the code.

> +        mov             x8, #0                          // i = 0
> +.Lloop_16_pixels:
> +
> +        mov             v16.16b, v2.16b                 // U acc low
> +        mov             v17.16b, v3.16b                 // U acc high
> +        mov             v18.16b, v4.16b                 // V acc low
> +        mov             v19.16b, v5.16b                 // V acc high
> +
> +        mov             v20.16b, v2.16b
> +        mov             v21.16b, v3.16b
> +        mov             v22.16b, v4.16b
> +        mov             v23.16b, v5.16b
> +
> +        mov             w9, w3                          // chrFilterSize counter
> +        mov             x10, x2                         // chrFilter pointer
> +        mov             x11, x4                         // chrUSrc base
> +        mov             x12, x5                         // chrVSrc base
> +
> +.Lfilter_loop_16:
> +        ldr             h6, [x10], #2                   // Load filter coefficient
> +
> +        ldr             x13, [x11], #8                  // chrUSrc[j]
> +        ldr             x14, [x12], #8                  // chrVSrc[j]
> +        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
> +        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
> +        add             x15, x13, #16                   // x15 = &chrUSrc[j][i+8] (8 samples * 2 bytes)
> +        add             x16, x14, #16
> +
> +        ld1             {v24.8h}, [x13]                 // U samples 0-7
> +        ld1             {v25.8h}, [x14]                 // V samples 0-7
> +
> +        ld1             {v26.8h}, [x15]                 // U samples 8-15
> +        ld1             {v27.8h}, [x16]                 // V samples 8-15
> +
> +        smlal           v16.4s, v24.4h, v6.h[0]
> +        smlal2          v17.4s, v24.8h, v6.h[0]
> +        smlal           v18.4s, v25.4h, v6.h[0]
> +        smlal2          v19.4s, v25.8h, v6.h[0]
> +
> +        smlal           v20.4s, v26.4h, v6.h[0]
> +        smlal2          v21.4s, v26.8h, v6.h[0]
> +        smlal           v22.4s, v27.4h, v6.h[0]
> +        smlal2          v23.4s, v27.8h, v6.h[0]
> +
> +        subs            w9, w9, #1
> +        b.gt            .Lfilter_loop_16
> +
> +        // Process and store first 8 pixels
> +        sqshrun         v28.4h, v16.4s, #16
> +        sqshrun2        v28.8h, v17.4s, #16
> +        sqshrun         v29.4h, v18.4s, #16
> +        sqshrun2        v29.8h, v19.4s, #16
> +        uqshrn          v30.8b, v28.8h, #3              // U
> +        uqshrn          v31.8b, v29.8h, #3              // V
> +
> +        // Process and store next 8 pixels
> +        sqshrun         v28.4h, v20.4s, #16
> +        sqshrun2        v28.8h, v21.4s, #16
> +        sqshrun         v29.4h, v22.4s, #16
> +        sqshrun2        v29.8h, v23.4s, #16
> +        uqshrn          v24.8b, v28.8h, #3              // U
> +        uqshrn          v25.8b, v29.8h, #3              // V
> +
> +        // Store both 8-pixel blocks
> +        st2             {v30.8b, v31.8b}, [x6], #16
> +        st2             {v24.8b, v25.8b}, [x6], #16
> +
> +        subs            w7, w7, #16
> +        add             x8, x8, #16
> +        b.gt            .Lloop_16_pixels
> +        ret
> +
> +        // =============================================
> +        // 8-pixel processing path (original code)
> +        // =============================================

What does "original code" mean here? That you first wrote this codepath, 
and then unrolled it into the 16 pixel version above? That comment doesn't 
really add much value to the future reader of the code.

> +.Lprocess_8_pixels:
> +        mov             x8, #0                        // i = 0
> +.Lloop_8_pixels:
> +        // Initialize accumulators with dither
> +        mov             v16.16b, v2.16b               // U acc low
> +        mov             v17.16b, v3.16b               // U acc high
> +        mov             v18.16b, v4.16b               // V acc low
> +        mov             v19.16b, v5.16b               // V acc high
> +
> +        mov             w9, w3                        // chrFilterSize counter
> +        mov             x10, x2                       // chrFilter pointer
> +        mov             x11, x4                       // chrUSrc base
> +        mov             x12, x5                       // chrVSrc base
> +
> +.Lfilter_loop_8:
> +        ldr             h6, [x10], #2                 // Load filter coefficient
> +
> +        ldr             x13, [x11], #8                // chrUSrc[j]
> +        ldr             x14, [x12], #8                // chrVSrc[j]
> +        add             x13, x13, x8, lsl #1          // &chrUSrc[j][i]
> +        add             x14, x14, x8, lsl #1          // &chrVSrc[j][i]
> +
> +        ld1             {v20.8h}, [x13]               // U samples
> +        ld1             {v21.8h}, [x14]               // V samples
> +
> +        smlal           v16.4s, v20.4h, v6.h[0]
> +        smlal2          v17.4s, v20.8h, v6.h[0]
> +        smlal           v18.4s, v21.4h, v6.h[0]
> +        smlal2          v19.4s, v21.8h, v6.h[0]
> +
> +        subs            w9, w9, #1
> +        b.gt            .Lfilter_loop_8
> +
> +        // Final processing and store
> +        sqshrun         v26.4h, v16.4s, #16
> +        sqshrun2        v26.8h, v17.4s, #16
> +        sqshrun         v27.4h, v18.4s, #16
> +        sqshrun2        v27.8h, v19.4s, #16
> +        uqshrn          v28.8b, v26.8h, #3            // U
> +        uqshrn          v29.8b, v27.8h, #3            // V
> +
> +        st2             {v28.8b, v29.8b}, [x6], #16
> +
> +        subs            w7, w7, #8
> +        add             x8, x8, #8
> +        b.gt            .Lloop_8_pixels
> +        ret
> +endfunc
> +
> +function ff_yuv2nv12cX_swapped_neon, export=1
> +// x0 - dstFormat (unused)
> +// x1 - uint8_t *chrDither
> +// x2 - int16_t *chrFilter
> +// x3 - int chrFilterSize
> +// x4 - int16_t **chrUSrc
> +// x5 - int16_t **chrVSrc
> +// x6 - uint8_t *dest
> +// x7 - int chrDstW

Isn't this function _exactly_ the same code as the previous one, just 
swapping the registers at the final uqshrn instruction before st2?

We don't want to have that level of code duplication. For such cases, wrap 
the function definition in a .macro and just add conditionals around the 
part of the function that actually differs.

> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
> index 6e5a721c1f..5246d53a16 100644
> --- a/libswscale/aarch64/swscale.c
> +++ b/libswscale/aarch64/swscale.c
> @@ -168,6 +168,16 @@ void ff_yuv2plane1_8_neon(
>         const uint8_t *dither,
>         int offset);
>
> +void ff_yuv2nv12cX_notswapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
> +    const int16_t *chrFilter, int chrFilterSize,
> +    const int16_t **chrUSrc, const int16_t **chrVSrc,
> +    uint8_t *dest, int chrDstW);

Indent the parameters properly aligned with the parameters on the first 
line. ff_yuv2plane1_8_neon above is incorrectly indentend, 
ff_yuv2planeX_8_neon above is right.

> +
> +void ff_yuv2nv12cX_swapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
> +    const int16_t *chrFilter, int chrFilterSize,
> +    const int16_t **chrUSrc, const int16_t **chrVSrc,
> +    uint8_t *dest, int chrDstW);
> +
> #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do {              \
>     if (c->srcBpc == 8) {                                               \
>         if(c->dstBpc <= 14) {                                           \
> @@ -201,6 +211,12 @@ void ff_yuv2plane1_8_neon(
>     default: break;                                                     \
>     }
>
> +#define ASSIGN_YUV2NV12_FUNC(yuv2nv12fn, opt, dstFormat)               \
> +    if(!isSwappedChroma(dstFormat))                                    \

Spaces between if and parenthesis in "if("

> +        yuv2nv12fn = ff_yuv2nv12cX_notswapped_  ## opt;                \

Stray double space between _ and ##.

> +    else                                                               \
> +        yuv2nv12fn = ff_yuv2nv12cX_swapped_  ## opt;
> +
> #define NEON_INPUT(name) \
> void ff_##name##ToY_neon(uint8_t *dst, const uint8_t *src, const uint8_t *, \
>                         const uint8_t *, int w, uint32_t *coeffs, void *); \
> @@ -275,7 +291,10 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
>         ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
>         if (c->dstBpc == 8) {
>             c->yuv2planeX = ff_yuv2planeX_8_neon;
> +            if(isSemiPlanarYUV(c->opts.dst_format))

Missing space in "if(".

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".