Re: [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: optimised alf_classify function 8/10/12bit of vvc codec for aarch64

From: George Zaguri <george.zaguri@gmail.com>
To: "Martin Storsjö" <martin@martin.st>
Cc: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Subject: Re: [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: optimised alf_classify function 8/10/12bit of vvc codec for aarch64
Date: Sun, 3 Aug 2025 18:54:10 +0100
Message-ID: <CAGQfSO1w_Bo3y-3uj2oj7d-zYJaYW2tX=E_KiWyN-1QYb=m8OQ@mail.gmail.com> (raw)
In-Reply-To: <2f1aa58f-826a-bd98-7ac2-5918d17ca2d@martin.st>

Thank you, Martin! I will have a look at it over the next week!

On Fri, Aug 1, 2025 at 6:50 PM Martin Storsjö <martin@martin.st> wrote:

> On Wed, 9 Jul 2025, Georgii Zagoruiko wrote:
>
> > diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S
> > index 8801b3afb6..9c9765ead1 100644
> > --- a/libavcodec/aarch64/vvc/alf.S
> > +++ b/libavcodec/aarch64/vvc/alf.S
> > @@ -291,3 +291,281 @@ function ff_alf_filter_chroma_kernel_10_neon,
> export=1
> > 1:
> >         alf_filter_chroma_kernel 2
> > endfunc
> > +
> > +
> > +
> > +.macro alf_classify_argvar2v30
> > +        mov             w16, #0
> > +        mov             v30.b[0], w16
> > +        mov             w16, #1
> > +        mov             v30.b[1], w16
> > +        mov             w16, #2
> > +        mov             v30.b[2], w16
> > +        mov             v30.b[3], w16
> > +        mov             v30.b[4], w16
> > +        mov             v30.b[5], w16
> > +        mov             v30.b[6], w16
> > +        mov             w16, #3
> > +        mov             v30.b[7], w16
> > +        mov             v30.b[8], w16
> > +        mov             v30.b[9], w16
> > +        mov             v30.b[10], w16
> > +        mov             v30.b[11], w16
> > +        mov             v30.b[12], w16
> > +        mov             v30.b[13], w16
> > +        mov             v30.b[14], w16
> > +        mov             w16, #4
> > +        mov             v30.b[15], w16
>
> Wouldn't it be more efficient to just have this pattern stored as a
> constant somewhere, and do a single load instruction (with some latency)
> to load it, rather than a 19 instruction sequence to initialize it?
>
> > +.endm
> > +
> > +.macro alf_classify_load_pixel pix_size, dst, src
> > +    .if \pix_size == 1
> > +        ldrb            \dst, [\src], #1
> > +    .else
> > +        ldrh            \dst, [\src], #2
> > +    .endif
> > +.endm
> > +
> > +.macro alf_classify_load_pixel_with_offset pix_size, dst, src, offset
> > +    .if \pix_size == 1
> > +        ldrb            \dst, [\src, #(\offset)]
> > +    .else
> > +        ldrh            \dst, [\src, #(2*\offset)]
> > +    .endif
> > +.endm
> > +
> > +#define ALF_BLOCK_SIZE          4
> > +#define ALF_GRADIENT_STEP       2
> > +#define ALF_GRADIENT_BORDER     2
> > +#define ALF_NUM_DIR             4
> > +#define ALF_GRAD_BORDER_X2      (ALF_GRADIENT_BORDER * 2)
> > +#define ALF_STRIDE_MUL          (ALF_GRADIENT_BORDER + 1)
> > +#define ALF_GRAD_X_VSTEP        (ALF_GRADIENT_STEP * 8)
> > +#define ALF_GSTRIDE_MUL         (ALF_NUM_DIR / ALF_GRADIENT_STEP)
> > +
> > +// Shift right: equal to division by 2 (see ALF_GRADIENT_STEP)
> > +#define ALF_GSTRIDE_XG_BYTES    (2 * ALF_NUM_DIR / ALF_GRADIENT_STEP)
> > +
> > +#define ALF_GSTRIDE_SUB_BYTES   (2 * ((ALF_BLOCK_SIZE +
> ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) * ALF_NUM_DIR)
> > +
> > +#define ALF_CLASS_INC           (ALF_GRADIENT_BORDER /
> ALF_GRADIENT_STEP)
> > +#define ALF_CLASS_END           ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER
> * 2) / ALF_GRADIENT_STEP)
> > +
> > +.macro ff_alf_classify_grad pix_size
> > +        // class_idx     .req x0
> > +        // transpose_idx .req x1
> > +        // _src          .req x2
> > +        // _src_stride   .req x3
> > +        // width         .req w4
> > +        // height        .req w5
> > +        // vb_pos        .req w6
> > +        // gradient_tmp  .req x7
> > +
> > +        mov             w16, #ALF_STRIDE_MUL
> > +        add             w5, w5, #ALF_GRAD_BORDER_X2 // h = height + 4
> > +        mul             x16, x3, x16                // 3 * stride
> > +        add             w4, w4, #ALF_GRAD_BORDER_X2 // w = width + 4
> > +        sub             x15, x2, x16                // src -= (3 *
> stride)
> > +        mov             x17, x7
> > +    .if \pix_size == 1
> > +        sub             x15, x15, #ALF_GRADIENT_BORDER
> > +    .else
> > +        sub             x15, x15, #4
> > +    .endif
> > +        mov             w8, #0                      // y loop: y = 0
> > +1:
> > +        cmp             w8, w5
> > +        bge             10f
> > +
> > +        add             x16, x8, #1
> > +        mul             x14, x8, x3                 // y * stride
> > +        mul             x16, x16, x3
> > +        add             x10, x15, x14               // s0 = src + y *
> stride
> > +        add             x14, x16, x3
> > +        add             x11, x15, x16               // s1
> > +        add             x16, x14, x3
> > +        add             x12, x15, x14               // s2
> > +        add             x13, x15, x16               // s3
> > +
> > +        // if (y == vb_pos): s3 = s2
> > +        cmp             w8, w6
> > +        add             w16, w6, #ALF_GRADIENT_BORDER
> > +        csel            x13, x12, x13, eq
> > +        // if (y == vb_pos + 2): s0 = s1
> > +        cmp             w8, w16
> > +        csel            x10, x11, x10, eq
> > +
> > +        alf_classify_load_pixel_with_offset \pix_size, w16, x10, -1
> > +        alf_classify_load_pixel \pix_size, w14, x13
> > +        mov             v16.h[7], w16
> > +        mov             v28.h[7], w14
> > +
> > +        // load 4 pixels from *(s1-2) & *(s2-2)
> > +    .if \pix_size == 1
> > +        sub             x11, x11, #2
> > +        sub             x12, x12, #2
> > +        ld1             {v0.8b}, [x11]
> > +        ld1             {v1.8b}, [x12]
> > +        uxtl            v17.8h, v0.8b
> > +        uxtl            v18.8h, v1.8b
> > +    .else
> > +        sub             x11, x11, #4
> > +        sub             x12, x12, #4
> > +        ld1             {v17.8h}, [x11]
> > +        ld1             {v18.8h}, [x12]
> > +    .endif
> > +        ext             v22.16b, v22.16b, v17.16b, #8
> > +        ext             v24.16b, v24.16b, v18.16b, #8
> > +    .if \pix_size == 1
> > +        add             x11, x11, #4
> > +        add             x12, x12, #4
> > +    .else
> > +        add             x11, x11, #8
> > +        add             x12, x12, #8
> > +    .endif
> > +
> > +        // x loop
> > +        mov             w9, #0
> > +        b               11f
> > +2:
> > +        cmp             w9, w4
> > +        bge             20f
>
> Instead of having w9 count up to w4, it'd be more common to just decrement
> one register instead. Here you could do "mov w9, w4" instead of "mov w9,
> #0", and just decrement w9 with "subs w9, w9, #8" at the end, which allows
> you to do the branch as "b.gt 2b", avoiding some extra jumping around at
> the end of the loop. Same for the outer loop for w8/w5.
>
> > +
> > +        // Store operation starts from the second cycle
> > +        st2             {v4.8h, v5.8h}, [x17], #32
> > +11:
> > +    .if \pix_size == 1
> > +        // Load 8 pixels: s0 & s1+2
> > +        ld1             {v0.8b}, [x10], #8
> > +        ld1             {v1.8b}, [x11], #8
> > +        uxtl            v20.8h, v0.8b
> > +        uxtl            v26.8h, v1.8b
> > +        // Load 8 pixels: s2+2 & s3+1
> > +        ld1             {v0.8b}, [x12], #8
> > +        ld1             {v1.8b}, [x13], #8
> > +        uxtl            v27.8h, v0.8b
> > +        uxtl            v19.8h, v1.8b
> > +    .else
> > +        // Load 8 pixels: s0
> > +        ld1             {v20.8h}, [x10], #16
> > +        // Load 8 pixels: s1+2
> > +        ld1             {v26.8h}, [x11], #16
> > +        // Load 8 pixels: s2+2
> > +        ld1             {v27.8h}, [x12], #16
> > +        // Load 8 pixels: s3+1
> > +        ld1             {v19.8h}, [x13], #16
> > +    .endif
> > +
> > +        ext             v16.16b, v16.16b, v20.16b, #14
> > +        ext             v28.16b, v28.16b, v19.16b, #14
> > +
> > +        ext             v17.16b, v22.16b, v26.16b, #8
> > +        ext             v22.16b, v22.16b, v26.16b, #12
> > +
> > +        ext             v18.16b, v24.16b, v27.16b, #8
> > +        ext             v24.16b, v24.16b, v27.16b, #12
> > +
> > +        // Grad: Vertical & D0 (interleaved)
> > +        trn1            v21.8h, v20.8h, v16.8h      // first abs:
> operand 1
> > +        rev32           v23.8h, v22.8h              // second abs:
> operand 1
> > +        trn2            v29.8h, v28.8h, v19.8h      // second abs:
> operand 2
> > +        trn1            v30.8h, v22.8h, v22.8h
> > +        trn2            v31.8h, v24.8h, v24.8h
> > +        add             v30.8h, v30.8h, v30.8h
> > +        add             v31.8h, v31.8h, v31.8h
> > +        sub             v0.8h, v30.8h, v21.8h
> > +        sub             v1.8h, v31.8h, v23.8h
> > +        sabd            v4.8h, v0.8h, v24.8h
> > +
> > +        // Grad: Horizontal & D1 (interleaved)
> > +        trn2            v21.8h, v17.8h, v20.8h      // first abs:
> operand 1
> > +        saba            v4.8h, v1.8h, v29.8h
> > +        trn2            v23.8h, v22.8h, v18.8h      // first abs:
> operand 2
> > +        trn1            v25.8h, v24.8h, v26.8h      // second abs:
> operand 1
> > +        trn1            v29.8h, v27.8h, v28.8h      // second abs:
> operand 2
> > +        sub             v0.8h, v30.8h, v21.8h
> > +        sub             v1.8h, v31.8h, v25.8h
> > +        sabd            v5.8h, v0.8h, v23.8h
> > +
> > +        // Prepare for the next interation:
> > +        mov             v16.16b, v20.16b
> > +        saba            v5.8h, v1.8h, v29.8h
> > +        mov             v28.16b, v19.16b
> > +        mov             v22.16b, v26.16b
> > +        mov             v24.16b, v27.16b
> > +
> > +        add             w9, w9, #8                  // x += 8
> > +        b               2b
> > +20:
> > +        // 8 pixels -> 4 cycles of generic
> > +        // 4 pixels -> paddings => half needs to be saved
> > +        st2             {v4.4h, v5.4h}, [x17], #16
> > +
> > +        add             w8, w8, #ALF_GRADIENT_STEP  // y += 2
> > +        b               1b
> > +10:
> > +        ret
> > +.endm
> > +
> > +.macro ff_alf_classify_sum
>
> This doesn't seem to need to be a macro, as it is only invoked once,
> without any parameters?
>
> > +        // sum0          .req x0
> > +        // sum1          .req x1
> > +        // grad          .req x2
> > +        // gshift        .req w3
> > +        // steps         .req w4
> > +        mov             w5, #2
> > +        mov             w6, #0
> > +        mul             w3, w3, w5
>
> If you want to multiply by 2, then just do "lsl w3, w3, #1".
>
> > +        movi            v16.4s, #0
> > +        movi            v21.4s, #0
> > +6:
> > +        prfm            pldl1keep, [x2]
>
> No prefetch instructions please. If you feel strongly about it, send a
> separate patch afterwards to add the prefetch instructions, with
> repeatable benchmark nubmers.
>
> > +        cmp             w6, w4
> > +        bge             60f
> > +
> > +        ld1             {v17.4h}, [x2], #8
> > +        ld1             {v18.4h}, [x2], #8
> > +        uxtl            v17.4s, v17.4h
> > +        ld1             {v19.4h}, [x2], #8
> > +        uxtl            v18.4s, v18.4h
> > +        ld1             {v20.4h}, [x2], #8
> > +        uxtl            v19.4s, v19.4h
> > +        ld1             {v22.4h}, [x2], #8
> > +        uxtl            v20.4s, v20.4h
> > +        ld1             {v23.4h}, [x2]
>
> This sequence of 6 separate sequential loads of .4h registers could be
> expressed as two loads, with {.4h, .4h, .4h, .4h} and {.4h, .4h}, or three
> loads of {.4h, .4h} - that should probably be more efficient than doing 6
> separate loads.
>
> > +        uxtl            v22.4s, v22.4h
> > +        uxtl            v23.4s, v23.4h
> > +        add             v17.4s, v17.4s, v18.4s
> > +        add             v16.4s, v16.4s, v17.4s
>
> As this addition to v16 uses v17, which was updated in the directly
> preceding instruction, this causes stalls on in-order cores. It'd be
> better to move this add instruction down by one or two instructions.
>
> > +        add             v19.4s, v19.4s, v20.4s
> > +        add             v22.4s, v22.4s, v23.4s
> > +        add             v21.4s, v21.4s, v19.4s
> > +        add             v16.4s, v16.4s, v19.4s
> > +        add             v21.4s, v21.4s, v22.4s
> > +
> > +        sub             x2, x2, #8
> > +        add             w6, w6, #1                  // i += 1
>
> Instead of this register w6 counting up to w4, it'd be more idiomatic and
> requiring one register less, if you'd just decrement w4 instead, with a
> "subs w4, w4, #1" instruction. Then you can just do "b.gt 6b" at the end,
> rather than having to jump back to 6 to do the comparison there.
>
> > +        add             x2, x2, x3                  // grad += gstride
> - size * ALF_NUM_DIR
>
> Instead of having to subtract 8 from x2 at the end of each loop, just
> subtract 8 from x3 before the loop.
>
> > +        b               6b
> > +60:
> > +        st1             {v16.4s}, [x0]
> > +        st1             {v21.4s}, [x1]
> > +        ret
> > +.endm
> > +
> > +
> > +function ff_alf_classify_sum_neon, export=1
> > +        ff_alf_classify_sum
> > +endfunc
> > +
> > +function ff_alf_classify_grad_8_neon, export=1
> > +        ff_alf_classify_grad 1
> > +endfunc
> > +
> > +function ff_alf_classify_grad_10_neon, export=1
> > +        ff_alf_classify_grad 2
> > +endfunc
> > +
> > +function ff_alf_classify_grad_12_neon, export=1
> > +        ff_alf_classify_grad 2
> > +endfunc
> > diff --git a/libavcodec/aarch64/vvc/alf_template.c
> b/libavcodec/aarch64/vvc/alf_template.c
> > index 41f7bf8995..470222a634 100644
> > --- a/libavcodec/aarch64/vvc/alf_template.c
> > +++ b/libavcodec/aarch64/vvc/alf_template.c
> > @@ -155,3 +155,90 @@ static void FUNC2(alf_filter_chroma, BIT_DEPTH,
> _neon)(uint8_t *_dst,
> >         }
> >     }
> > }
> > +
> > +#define ALF_DIR_VERT        0
> > +#define ALF_DIR_HORZ        1
> > +#define ALF_DIR_DIGA0       2
> > +#define ALF_DIR_DIGA1       3
> > +
> > +static void FUNC(ff_alf_get_idx)(int *class_idx, int *transpose_idx,
> const int *sum, const int ac)
> > +{
>
> Static functions don't need an ff_ prefix
>
> > +    static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
> 3, 3, 3, 4 };
> > +
> > +    int hv0, hv1, dir_hv, d0, d1, dir_d, hvd1, hvd0, sum_hv, dir1;
> > +
> > +    dir_hv = sum[ALF_DIR_VERT] <= sum[ALF_DIR_HORZ];
> > +    hv1    = FFMAX(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
> > +    hv0    = FFMIN(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
> > +
> > +    dir_d  = sum[ALF_DIR_DIGA0] <= sum[ALF_DIR_DIGA1];
> > +    d1     = FFMAX(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
> > +    d0     = FFMIN(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
> > +
> > +    //promote to avoid overflow
> > +    dir1 = (uint64_t)d1 * hv0 <= (uint64_t)hv1 * d0;
> > +    hvd1 = dir1 ? hv1 : d1;
> > +    hvd0 = dir1 ? hv0 : d0;
> > +
> > +    sum_hv = sum[ALF_DIR_HORZ] + sum[ALF_DIR_VERT];
> > +    *class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1),
> 4)];
> > +    if (hvd1 * 2 > 9 * hvd0)
> > +        *class_idx += ((dir1 << 1) + 2) * 5;
> > +    else if (hvd1 > 2 * hvd0)
> > +        *class_idx += ((dir1 << 1) + 1) * 5;
> > +
> > +    *transpose_idx = dir_d * 2 + dir_hv;
> > +}
> > +
> > +static void FUNC(ff_alf_classify)(int *class_idx, int *transpose_idx,
>
> Ditto
>
> > +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width,
> const int height,
> > +    const int vb_pos, int16_t *gradient_tmp)
> > +{
> > +    int16_t *grad;
> > +
> > +    const int w = width  + ALF_GRADIENT_BORDER * 2;
> > +    const int size = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) /
> ALF_GRADIENT_STEP;
> > +    const int gstride = (w / ALF_GRADIENT_STEP) * ALF_NUM_DIR;
> > +    const int gshift = gstride - size * ALF_NUM_DIR;
> > +
> > +    for (int y = 0; y < height ; y += ALF_BLOCK_SIZE ) {
> > +        int start = 0;
> > +        int end   = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) /
> ALF_GRADIENT_STEP;
> > +        int ac    = 2;
> > +        if (y + ALF_BLOCK_SIZE == vb_pos) {
> > +            end -= ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
> > +            ac = 3;
> > +        } else if (y == vb_pos) {
> > +            start += ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
> > +            ac = 3;
> > +        }
> > +        for (int x = 0; x < width; x += (2*ALF_BLOCK_SIZE)) {
> > +            const int xg = x / ALF_GRADIENT_STEP;
> > +            const int yg = y / ALF_GRADIENT_STEP;
> > +            int sum0[ALF_NUM_DIR];
> > +            int sum1[ALF_NUM_DIR];
> > +            grad = gradient_tmp + (yg + start) * gstride + xg *
> ALF_NUM_DIR;
> > +         ff_alf_classify_sum_neon(sum0, sum1, grad, gshift, end-start);
>
> This line seems to be indented differently than the rest
>
> > +            FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum0, ac);
> > +            class_idx++;
> > +            transpose_idx++;
> > +            FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum1, ac);
> > +            class_idx++;
> > +            transpose_idx++;
> > +        }
> > +    }
> > +
> > +}
> > +
> > +void FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(int *class_idx, int
> *transpose_idx,
> > +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width,
> const int height,
> > +    const int vb_pos, int16_t *gradient_tmp);
> > +
> > +
> > +static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int
> *transpose_idx,
> > +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width,
> const int height,
> > +    const int vb_pos, int *gradient_tmp) {
>
> Move the opening brace of a function to the next line
>
> > +        FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx,
> transpose_idx, _src, _src_stride, width, height, vb_pos,
> (int16_t*)gradient_tmp);
>
> This is indented one level too much. With the opening brace on its own
> line, that would be clearer to see.
>
>
> If you want to, you can sign up at https://code.ffmpeg.org and submit
> your
> patch as a PR to https://code.ffmpeg.org/FFmpeg/FFmpeg, for easier
> followup/reviewing of your submission.
>
>
> // Martin
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".