Re: [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: optimised alf_classify function 8/10/12bit of vvc codec for aarch64

From: "Martin Storsjö" <martin@martin.st>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: Georgii Zagoruiko <george.zaguri@gmail.com>
Subject: Re: [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: optimised alf_classify function 8/10/12bit of vvc codec for aarch64
Date: Fri, 1 Aug 2025 20:50:06 +0300 (EEST)
Message-ID: <2f1aa58f-826a-bd98-7ac2-5918d17ca2d@martin.st> (raw)
In-Reply-To: <20250709075652.5636-1-george.zaguri@gmail.com>

On Wed, 9 Jul 2025, Georgii Zagoruiko wrote:

> diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S
> index 8801b3afb6..9c9765ead1 100644
> --- a/libavcodec/aarch64/vvc/alf.S
> +++ b/libavcodec/aarch64/vvc/alf.S
> @@ -291,3 +291,281 @@ function ff_alf_filter_chroma_kernel_10_neon, export=1
> 1:
>         alf_filter_chroma_kernel 2
> endfunc
> +
> +
> +
> +.macro alf_classify_argvar2v30
> +        mov             w16, #0
> +        mov             v30.b[0], w16
> +        mov             w16, #1
> +        mov             v30.b[1], w16
> +        mov             w16, #2
> +        mov             v30.b[2], w16
> +        mov             v30.b[3], w16
> +        mov             v30.b[4], w16
> +        mov             v30.b[5], w16
> +        mov             v30.b[6], w16
> +        mov             w16, #3
> +        mov             v30.b[7], w16
> +        mov             v30.b[8], w16
> +        mov             v30.b[9], w16
> +        mov             v30.b[10], w16
> +        mov             v30.b[11], w16
> +        mov             v30.b[12], w16
> +        mov             v30.b[13], w16
> +        mov             v30.b[14], w16
> +        mov             w16, #4
> +        mov             v30.b[15], w16

Wouldn't it be more efficient to just have this pattern stored as a 
constant somewhere, and do a single load instruction (with some latency) 
to load it, rather than a 19 instruction sequence to initialize it?

> +.endm
> +
> +.macro alf_classify_load_pixel pix_size, dst, src
> +    .if \pix_size == 1
> +        ldrb            \dst, [\src], #1
> +    .else
> +        ldrh            \dst, [\src], #2
> +    .endif
> +.endm
> +
> +.macro alf_classify_load_pixel_with_offset pix_size, dst, src, offset
> +    .if \pix_size == 1
> +        ldrb            \dst, [\src, #(\offset)]
> +    .else
> +        ldrh            \dst, [\src, #(2*\offset)]
> +    .endif
> +.endm
> +
> +#define ALF_BLOCK_SIZE          4
> +#define ALF_GRADIENT_STEP       2
> +#define ALF_GRADIENT_BORDER     2
> +#define ALF_NUM_DIR             4
> +#define ALF_GRAD_BORDER_X2      (ALF_GRADIENT_BORDER * 2)
> +#define ALF_STRIDE_MUL          (ALF_GRADIENT_BORDER + 1)
> +#define ALF_GRAD_X_VSTEP        (ALF_GRADIENT_STEP * 8)
> +#define ALF_GSTRIDE_MUL         (ALF_NUM_DIR / ALF_GRADIENT_STEP)
> +
> +// Shift right: equal to division by 2 (see ALF_GRADIENT_STEP)
> +#define ALF_GSTRIDE_XG_BYTES    (2 * ALF_NUM_DIR / ALF_GRADIENT_STEP)
> +
> +#define ALF_GSTRIDE_SUB_BYTES   (2 * ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) * ALF_NUM_DIR)
> +
> +#define ALF_CLASS_INC           (ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP)
> +#define ALF_CLASS_END           ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP)
> +
> +.macro ff_alf_classify_grad pix_size
> +        // class_idx     .req x0
> +        // transpose_idx .req x1
> +        // _src          .req x2
> +        // _src_stride   .req x3
> +        // width         .req w4
> +        // height        .req w5
> +        // vb_pos        .req w6
> +        // gradient_tmp  .req x7
> +
> +        mov             w16, #ALF_STRIDE_MUL
> +        add             w5, w5, #ALF_GRAD_BORDER_X2 // h = height + 4
> +        mul             x16, x3, x16                // 3 * stride
> +        add             w4, w4, #ALF_GRAD_BORDER_X2 // w = width + 4
> +        sub             x15, x2, x16                // src -= (3 * stride)
> +        mov             x17, x7
> +    .if \pix_size == 1
> +        sub             x15, x15, #ALF_GRADIENT_BORDER
> +    .else
> +        sub             x15, x15, #4
> +    .endif
> +        mov             w8, #0                      // y loop: y = 0
> +1:
> +        cmp             w8, w5
> +        bge             10f
> +
> +        add             x16, x8, #1
> +        mul             x14, x8, x3                 // y * stride
> +        mul             x16, x16, x3
> +        add             x10, x15, x14               // s0 = src + y * stride
> +        add             x14, x16, x3
> +        add             x11, x15, x16               // s1
> +        add             x16, x14, x3
> +        add             x12, x15, x14               // s2
> +        add             x13, x15, x16               // s3
> +
> +        // if (y == vb_pos): s3 = s2
> +        cmp             w8, w6
> +        add             w16, w6, #ALF_GRADIENT_BORDER
> +        csel            x13, x12, x13, eq
> +        // if (y == vb_pos + 2): s0 = s1
> +        cmp             w8, w16
> +        csel            x10, x11, x10, eq
> +
> +        alf_classify_load_pixel_with_offset \pix_size, w16, x10, -1
> +        alf_classify_load_pixel \pix_size, w14, x13
> +        mov             v16.h[7], w16
> +        mov             v28.h[7], w14
> +
> +        // load 4 pixels from *(s1-2) & *(s2-2)
> +    .if \pix_size == 1
> +        sub             x11, x11, #2
> +        sub             x12, x12, #2
> +        ld1             {v0.8b}, [x11]
> +        ld1             {v1.8b}, [x12]
> +        uxtl            v17.8h, v0.8b
> +        uxtl            v18.8h, v1.8b
> +    .else
> +        sub             x11, x11, #4
> +        sub             x12, x12, #4
> +        ld1             {v17.8h}, [x11]
> +        ld1             {v18.8h}, [x12]
> +    .endif
> +        ext             v22.16b, v22.16b, v17.16b, #8
> +        ext             v24.16b, v24.16b, v18.16b, #8
> +    .if \pix_size == 1
> +        add             x11, x11, #4
> +        add             x12, x12, #4
> +    .else
> +        add             x11, x11, #8
> +        add             x12, x12, #8
> +    .endif
> +
> +        // x loop
> +        mov             w9, #0
> +        b               11f
> +2:
> +        cmp             w9, w4
> +        bge             20f

Instead of having w9 count up to w4, it'd be more common to just decrement 
one register instead. Here you could do "mov w9, w4" instead of "mov w9, 
#0", and just decrement w9 with "subs w9, w9, #8" at the end, which allows 
you to do the branch as "b.gt 2b", avoiding some extra jumping around at 
the end of the loop. Same for the outer loop for w8/w5.

> +
> +        // Store operation starts from the second cycle
> +        st2             {v4.8h, v5.8h}, [x17], #32
> +11:
> +    .if \pix_size == 1
> +        // Load 8 pixels: s0 & s1+2
> +        ld1             {v0.8b}, [x10], #8
> +        ld1             {v1.8b}, [x11], #8
> +        uxtl            v20.8h, v0.8b
> +        uxtl            v26.8h, v1.8b
> +        // Load 8 pixels: s2+2 & s3+1
> +        ld1             {v0.8b}, [x12], #8
> +        ld1             {v1.8b}, [x13], #8
> +        uxtl            v27.8h, v0.8b
> +        uxtl            v19.8h, v1.8b
> +    .else
> +        // Load 8 pixels: s0
> +        ld1             {v20.8h}, [x10], #16
> +        // Load 8 pixels: s1+2
> +        ld1             {v26.8h}, [x11], #16
> +        // Load 8 pixels: s2+2
> +        ld1             {v27.8h}, [x12], #16
> +        // Load 8 pixels: s3+1
> +        ld1             {v19.8h}, [x13], #16
> +    .endif
> +
> +        ext             v16.16b, v16.16b, v20.16b, #14
> +        ext             v28.16b, v28.16b, v19.16b, #14
> +
> +        ext             v17.16b, v22.16b, v26.16b, #8
> +        ext             v22.16b, v22.16b, v26.16b, #12
> +
> +        ext             v18.16b, v24.16b, v27.16b, #8
> +        ext             v24.16b, v24.16b, v27.16b, #12
> +
> +        // Grad: Vertical & D0 (interleaved)
> +        trn1            v21.8h, v20.8h, v16.8h      // first abs: operand 1
> +        rev32           v23.8h, v22.8h              // second abs: operand 1
> +        trn2            v29.8h, v28.8h, v19.8h      // second abs: operand 2
> +        trn1            v30.8h, v22.8h, v22.8h
> +        trn2            v31.8h, v24.8h, v24.8h
> +        add             v30.8h, v30.8h, v30.8h
> +        add             v31.8h, v31.8h, v31.8h
> +        sub             v0.8h, v30.8h, v21.8h
> +        sub             v1.8h, v31.8h, v23.8h
> +        sabd            v4.8h, v0.8h, v24.8h
> +
> +        // Grad: Horizontal & D1 (interleaved)
> +        trn2            v21.8h, v17.8h, v20.8h      // first abs: operand 1
> +        saba            v4.8h, v1.8h, v29.8h
> +        trn2            v23.8h, v22.8h, v18.8h      // first abs: operand 2
> +        trn1            v25.8h, v24.8h, v26.8h      // second abs: operand 1
> +        trn1            v29.8h, v27.8h, v28.8h      // second abs: operand 2
> +        sub             v0.8h, v30.8h, v21.8h
> +        sub             v1.8h, v31.8h, v25.8h
> +        sabd            v5.8h, v0.8h, v23.8h
> +
> +        // Prepare for the next interation:
> +        mov             v16.16b, v20.16b
> +        saba            v5.8h, v1.8h, v29.8h
> +        mov             v28.16b, v19.16b
> +        mov             v22.16b, v26.16b
> +        mov             v24.16b, v27.16b
> +
> +        add             w9, w9, #8                  // x += 8
> +        b               2b
> +20:
> +        // 8 pixels -> 4 cycles of generic
> +        // 4 pixels -> paddings => half needs to be saved
> +        st2             {v4.4h, v5.4h}, [x17], #16
> +
> +        add             w8, w8, #ALF_GRADIENT_STEP  // y += 2
> +        b               1b
> +10:
> +        ret
> +.endm
> +
> +.macro ff_alf_classify_sum

This doesn't seem to need to be a macro, as it is only invoked once, 
without any parameters?

> +        // sum0          .req x0
> +        // sum1          .req x1
> +        // grad          .req x2
> +        // gshift        .req w3
> +        // steps         .req w4
> +        mov             w5, #2
> +        mov             w6, #0
> +        mul             w3, w3, w5

If you want to multiply by 2, then just do "lsl w3, w3, #1".

> +        movi            v16.4s, #0
> +        movi            v21.4s, #0
> +6:
> +        prfm            pldl1keep, [x2]

No prefetch instructions please. If you feel strongly about it, send a 
separate patch afterwards to add the prefetch instructions, with 
repeatable benchmark nubmers.

> +        cmp             w6, w4
> +        bge             60f
> +
> +        ld1             {v17.4h}, [x2], #8
> +        ld1             {v18.4h}, [x2], #8
> +        uxtl            v17.4s, v17.4h
> +        ld1             {v19.4h}, [x2], #8
> +        uxtl            v18.4s, v18.4h
> +        ld1             {v20.4h}, [x2], #8
> +        uxtl            v19.4s, v19.4h
> +        ld1             {v22.4h}, [x2], #8
> +        uxtl            v20.4s, v20.4h
> +        ld1             {v23.4h}, [x2]

This sequence of 6 separate sequential loads of .4h registers could be 
expressed as two loads, with {.4h, .4h, .4h, .4h} and {.4h, .4h}, or three 
loads of {.4h, .4h} - that should probably be more efficient than doing 6 
separate loads.

> +        uxtl            v22.4s, v22.4h
> +        uxtl            v23.4s, v23.4h
> +        add             v17.4s, v17.4s, v18.4s
> +        add             v16.4s, v16.4s, v17.4s

As this addition to v16 uses v17, which was updated in the directly 
preceding instruction, this causes stalls on in-order cores. It'd be 
better to move this add instruction down by one or two instructions.

> +        add             v19.4s, v19.4s, v20.4s
> +        add             v22.4s, v22.4s, v23.4s
> +        add             v21.4s, v21.4s, v19.4s
> +        add             v16.4s, v16.4s, v19.4s
> +        add             v21.4s, v21.4s, v22.4s
> +
> +        sub             x2, x2, #8
> +        add             w6, w6, #1                  // i += 1

Instead of this register w6 counting up to w4, it'd be more idiomatic and 
requiring one register less, if you'd just decrement w4 instead, with a 
"subs w4, w4, #1" instruction. Then you can just do "b.gt 6b" at the end, 
rather than having to jump back to 6 to do the comparison there.

> +        add             x2, x2, x3                  // grad += gstride - size * ALF_NUM_DIR

Instead of having to subtract 8 from x2 at the end of each loop, just 
subtract 8 from x3 before the loop.

> +        b               6b
> +60:
> +        st1             {v16.4s}, [x0]
> +        st1             {v21.4s}, [x1]
> +        ret
> +.endm
> +
> +
> +function ff_alf_classify_sum_neon, export=1
> +        ff_alf_classify_sum
> +endfunc
> +
> +function ff_alf_classify_grad_8_neon, export=1
> +        ff_alf_classify_grad 1
> +endfunc
> +
> +function ff_alf_classify_grad_10_neon, export=1
> +        ff_alf_classify_grad 2
> +endfunc
> +
> +function ff_alf_classify_grad_12_neon, export=1
> +        ff_alf_classify_grad 2
> +endfunc
> diff --git a/libavcodec/aarch64/vvc/alf_template.c b/libavcodec/aarch64/vvc/alf_template.c
> index 41f7bf8995..470222a634 100644
> --- a/libavcodec/aarch64/vvc/alf_template.c
> +++ b/libavcodec/aarch64/vvc/alf_template.c
> @@ -155,3 +155,90 @@ static void FUNC2(alf_filter_chroma, BIT_DEPTH, _neon)(uint8_t *_dst,
>         }
>     }
> }
> +
> +#define ALF_DIR_VERT        0
> +#define ALF_DIR_HORZ        1
> +#define ALF_DIR_DIGA0       2
> +#define ALF_DIR_DIGA1       3
> +
> +static void FUNC(ff_alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac)
> +{

Static functions don't need an ff_ prefix

> +    static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 };
> +
> +    int hv0, hv1, dir_hv, d0, d1, dir_d, hvd1, hvd0, sum_hv, dir1;
> +
> +    dir_hv = sum[ALF_DIR_VERT] <= sum[ALF_DIR_HORZ];
> +    hv1    = FFMAX(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
> +    hv0    = FFMIN(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
> +
> +    dir_d  = sum[ALF_DIR_DIGA0] <= sum[ALF_DIR_DIGA1];
> +    d1     = FFMAX(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
> +    d0     = FFMIN(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
> +
> +    //promote to avoid overflow
> +    dir1 = (uint64_t)d1 * hv0 <= (uint64_t)hv1 * d0;
> +    hvd1 = dir1 ? hv1 : d1;
> +    hvd0 = dir1 ? hv0 : d0;
> +
> +    sum_hv = sum[ALF_DIR_HORZ] + sum[ALF_DIR_VERT];
> +    *class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)];
> +    if (hvd1 * 2 > 9 * hvd0)
> +        *class_idx += ((dir1 << 1) + 2) * 5;
> +    else if (hvd1 > 2 * hvd0)
> +        *class_idx += ((dir1 << 1) + 1) * 5;
> +
> +    *transpose_idx = dir_d * 2 + dir_hv;
> +}
> +
> +static void FUNC(ff_alf_classify)(int *class_idx, int *transpose_idx,

Ditto

> +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> +    const int vb_pos, int16_t *gradient_tmp)
> +{
> +    int16_t *grad;
> +
> +    const int w = width  + ALF_GRADIENT_BORDER * 2;
> +    const int size = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
> +    const int gstride = (w / ALF_GRADIENT_STEP) * ALF_NUM_DIR;
> +    const int gshift = gstride - size * ALF_NUM_DIR;
> +
> +    for (int y = 0; y < height ; y += ALF_BLOCK_SIZE ) {
> +        int start = 0;
> +        int end   = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
> +        int ac    = 2;
> +        if (y + ALF_BLOCK_SIZE == vb_pos) {
> +            end -= ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
> +            ac = 3;
> +        } else if (y == vb_pos) {
> +            start += ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
> +            ac = 3;
> +        }
> +        for (int x = 0; x < width; x += (2*ALF_BLOCK_SIZE)) {
> +            const int xg = x / ALF_GRADIENT_STEP;
> +            const int yg = y / ALF_GRADIENT_STEP;
> +            int sum0[ALF_NUM_DIR];
> +            int sum1[ALF_NUM_DIR];
> +            grad = gradient_tmp + (yg + start) * gstride + xg * ALF_NUM_DIR;
> +	    ff_alf_classify_sum_neon(sum0, sum1, grad, gshift, end-start);

This line seems to be indented differently than the rest

> +            FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum0, ac);
> +            class_idx++;
> +            transpose_idx++;
> +            FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum1, ac);
> +            class_idx++;
> +            transpose_idx++;
> +        }
> +    }
> +
> +}
> +
> +void FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
> +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> +    const int vb_pos, int16_t *gradient_tmp);
> +
> +
> +static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
> +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> +    const int vb_pos, int *gradient_tmp) {

Move the opening brace of a function to the next line

> +        FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);

This is indented one level too much. With the opening brace on its own 
line, that would be clearer to see.

If you want to, you can sign up at https://code.ffmpeg.org and submit your 
patch as a PR to https://code.ffmpeg.org/FFmpeg/FFmpeg, for easier 
followup/reviewing of your submission.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".