From: "Martin Storsjö" <martin@martin.st> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: Georgii Zagoruiko <george.zaguri@gmail.com> Subject: Re: [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: optimised alf_classify function 8/10/12bit of vvc codec for aarch64 Date: Fri, 1 Aug 2025 20:50:06 +0300 (EEST) Message-ID: <2f1aa58f-826a-bd98-7ac2-5918d17ca2d@martin.st> (raw) In-Reply-To: <20250709075652.5636-1-george.zaguri@gmail.com> On Wed, 9 Jul 2025, Georgii Zagoruiko wrote: > diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S > index 8801b3afb6..9c9765ead1 100644 > --- a/libavcodec/aarch64/vvc/alf.S > +++ b/libavcodec/aarch64/vvc/alf.S > @@ -291,3 +291,281 @@ function ff_alf_filter_chroma_kernel_10_neon, export=1 > 1: > alf_filter_chroma_kernel 2 > endfunc > + > + > + > +.macro alf_classify_argvar2v30 > + mov w16, #0 > + mov v30.b[0], w16 > + mov w16, #1 > + mov v30.b[1], w16 > + mov w16, #2 > + mov v30.b[2], w16 > + mov v30.b[3], w16 > + mov v30.b[4], w16 > + mov v30.b[5], w16 > + mov v30.b[6], w16 > + mov w16, #3 > + mov v30.b[7], w16 > + mov v30.b[8], w16 > + mov v30.b[9], w16 > + mov v30.b[10], w16 > + mov v30.b[11], w16 > + mov v30.b[12], w16 > + mov v30.b[13], w16 > + mov v30.b[14], w16 > + mov w16, #4 > + mov v30.b[15], w16 Wouldn't it be more efficient to just have this pattern stored as a constant somewhere, and do a single load instruction (with some latency) to load it, rather than a 19 instruction sequence to initialize it? > +.endm > + > +.macro alf_classify_load_pixel pix_size, dst, src > + .if \pix_size == 1 > + ldrb \dst, [\src], #1 > + .else > + ldrh \dst, [\src], #2 > + .endif > +.endm > + > +.macro alf_classify_load_pixel_with_offset pix_size, dst, src, offset > + .if \pix_size == 1 > + ldrb \dst, [\src, #(\offset)] > + .else > + ldrh \dst, [\src, #(2*\offset)] > + .endif > +.endm > + > +#define ALF_BLOCK_SIZE 4 > +#define ALF_GRADIENT_STEP 2 > +#define ALF_GRADIENT_BORDER 2 > +#define ALF_NUM_DIR 4 > +#define ALF_GRAD_BORDER_X2 (ALF_GRADIENT_BORDER * 2) > +#define ALF_STRIDE_MUL (ALF_GRADIENT_BORDER + 1) > +#define ALF_GRAD_X_VSTEP (ALF_GRADIENT_STEP * 8) > +#define ALF_GSTRIDE_MUL (ALF_NUM_DIR / ALF_GRADIENT_STEP) > + > +// Shift right: equal to division by 2 (see ALF_GRADIENT_STEP) > +#define ALF_GSTRIDE_XG_BYTES (2 * ALF_NUM_DIR / ALF_GRADIENT_STEP) > + > +#define ALF_GSTRIDE_SUB_BYTES (2 * ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) * ALF_NUM_DIR) > + > +#define ALF_CLASS_INC (ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP) > +#define ALF_CLASS_END ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) > + > +.macro ff_alf_classify_grad pix_size > + // class_idx .req x0 > + // transpose_idx .req x1 > + // _src .req x2 > + // _src_stride .req x3 > + // width .req w4 > + // height .req w5 > + // vb_pos .req w6 > + // gradient_tmp .req x7 > + > + mov w16, #ALF_STRIDE_MUL > + add w5, w5, #ALF_GRAD_BORDER_X2 // h = height + 4 > + mul x16, x3, x16 // 3 * stride > + add w4, w4, #ALF_GRAD_BORDER_X2 // w = width + 4 > + sub x15, x2, x16 // src -= (3 * stride) > + mov x17, x7 > + .if \pix_size == 1 > + sub x15, x15, #ALF_GRADIENT_BORDER > + .else > + sub x15, x15, #4 > + .endif > + mov w8, #0 // y loop: y = 0 > +1: > + cmp w8, w5 > + bge 10f > + > + add x16, x8, #1 > + mul x14, x8, x3 // y * stride > + mul x16, x16, x3 > + add x10, x15, x14 // s0 = src + y * stride > + add x14, x16, x3 > + add x11, x15, x16 // s1 > + add x16, x14, x3 > + add x12, x15, x14 // s2 > + add x13, x15, x16 // s3 > + > + // if (y 
> +        cmp w8, w6
> +        add w16, w6, #ALF_GRADIENT_BORDER
> +        csel x13, x12, x13, eq
> +        // if (y == vb_pos + 2): s0 = s1
> +        cmp w8, w16
> +        csel x10, x11, x10, eq
> +
> +        alf_classify_load_pixel_with_offset \pix_size, w16, x10, -1
> +        alf_classify_load_pixel \pix_size, w14, x13
> +        mov v16.h[7], w16
> +        mov v28.h[7], w14
> +
> +        // load 4 pixels from *(s1-2) & *(s2-2)
> +.if \pix_size == 1
> +        sub x11, x11, #2
> +        sub x12, x12, #2
> +        ld1 {v0.8b}, [x11]
> +        ld1 {v1.8b}, [x12]
> +        uxtl v17.8h, v0.8b
> +        uxtl v18.8h, v1.8b
> +.else
> +        sub x11, x11, #4
> +        sub x12, x12, #4
> +        ld1 {v17.8h}, [x11]
> +        ld1 {v18.8h}, [x12]
> +.endif
> +        ext v22.16b, v22.16b, v17.16b, #8
> +        ext v24.16b, v24.16b, v18.16b, #8
> +.if \pix_size == 1
> +        add x11, x11, #4
> +        add x12, x12, #4
> +.else
> +        add x11, x11, #8
> +        add x12, x12, #8
> +.endif
> +
> +        // x loop
> +        mov w9, #0
> +        b 11f
> +2:
> +        cmp w9, w4
> +        bge 20f

Instead of having w9 count up to w4, it'd be more common to just
decrement one register instead. Here you could do "mov w9, w4" instead of
"mov w9, #0", and just decrement w9 with "subs w9, w9, #8" at the end,
which allows you to do the branch as "b.gt 2b", avoiding some extra
jumping around at the end of the loop.

Same for the outer loop for w8/w5.

> +
> +        // Store operation starts from the second cycle
> +        st2 {v4.8h, v5.8h}, [x17], #32
> +11:
> +.if \pix_size == 1
> +        // Load 8 pixels: s0 & s1+2
> +        ld1 {v0.8b}, [x10], #8
> +        ld1 {v1.8b}, [x11], #8
> +        uxtl v20.8h, v0.8b
> +        uxtl v26.8h, v1.8b
> +        // Load 8 pixels: s2+2 & s3+1
> +        ld1 {v0.8b}, [x12], #8
> +        ld1 {v1.8b}, [x13], #8
> +        uxtl v27.8h, v0.8b
> +        uxtl v19.8h, v1.8b
> +.else
> +        // Load 8 pixels: s0
> +        ld1 {v20.8h}, [x10], #16
> +        // Load 8 pixels: s1+2
> +        ld1 {v26.8h}, [x11], #16
> +        // Load 8 pixels: s2+2
> +        ld1 {v27.8h}, [x12], #16
> +        // Load 8 pixels: s3+1
> +        ld1 {v19.8h}, [x13], #16
> +.endif
> +
> +        ext v16.16b, v16.16b, v20.16b, #14
> +        ext v28.16b, v28.16b, v19.16b, #14
> +
> +        ext v17.16b, v22.16b, v26.16b, #8
> +        ext v22.16b, v22.16b, v26.16b, #12
> +
> +        ext v18.16b, v24.16b, v27.16b, #8
> +        ext v24.16b, v24.16b, v27.16b, #12
> +
> +        // Grad: Vertical & D0 (interleaved)
> +        trn1 v21.8h, v20.8h, v16.8h         // first abs: operand 1
> +        rev32 v23.8h, v22.8h                // second abs: operand 1
> +        trn2 v29.8h, v28.8h, v19.8h         // second abs: operand 2
> +        trn1 v30.8h, v22.8h, v22.8h
> +        trn2 v31.8h, v24.8h, v24.8h
> +        add v30.8h, v30.8h, v30.8h
> +        add v31.8h, v31.8h, v31.8h
> +        sub v0.8h, v30.8h, v21.8h
> +        sub v1.8h, v31.8h, v23.8h
> +        sabd v4.8h, v0.8h, v24.8h
> +
> +        // Grad: Horizontal & D1 (interleaved)
> +        trn2 v21.8h, v17.8h, v20.8h         // first abs: operand 1
> +        saba v4.8h, v1.8h, v29.8h
> +        trn2 v23.8h, v22.8h, v18.8h         // first abs: operand 2
> +        trn1 v25.8h, v24.8h, v26.8h         // second abs: operand 1
> +        trn1 v29.8h, v27.8h, v28.8h         // second abs: operand 2
> +        sub v0.8h, v30.8h, v21.8h
> +        sub v1.8h, v31.8h, v25.8h
> +        sabd v5.8h, v0.8h, v23.8h
> +
> +        // Prepare for the next interation:
> +        mov v16.16b, v20.16b
> +        saba v5.8h, v1.8h, v29.8h
> +        mov v28.16b, v19.16b
> +        mov v22.16b, v26.16b
> +        mov v24.16b, v27.16b
> +
> +        add w9, w9, #8                      // x += 8
> +        b 2b
> +20:
> +        // 8 pixels -> 4 cycles of generic
> +        // 4 pixels -> paddings => half needs to be saved
> +        st2 {v4.4h, v5.4h}, [x17], #16
> +
> +        add w8, w8, #ALF_GRADIENT_STEP      // y += 2
> +        b 1b
> +10:
> +        ret
> +.endm
> +
> +.macro ff_alf_classify_sum

This doesn't seem to need to be a macro, as it is only invoked once,
without any parameters?
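
To make the loop structure comment further up a bit more concrete, an
untested sketch of the inner x loop with w9 counting down (keeping your
staggered-store arrangement as it is, register usage as in your patch)
could look roughly like this:

        mov     w9, w4
        b       11f
2:
        st2     {v4.8h, v5.8h}, [x17], #32
11:
        // ... gradient computation as before ...
        subs    w9, w9, #8                  // x -= 8
        b.gt    2b
        // falls through to the final partial st2 {v4.4h, v5.4h}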

> +        // sum0   .req x0
> +        // sum1   .req x1
> +        // grad   .req x2
> +        // gshift .req w3
> +        // steps  .req w4
> +        mov w5, #2
> +        mov w6, #0
> +        mul w3, w3, w5

If you want to multiply by 2, then just do "lsl w3, w3, #1".

> +        movi v16.4s, #0
> +        movi v21.4s, #0
> +6:
> +        prfm pldl1keep, [x2]

No prefetch instructions please. If you feel strongly about it, send a
separate patch afterwards to add the prefetch instructions, with
repeatable benchmark numbers.

> +        cmp w6, w4
> +        bge 60f
> +
> +        ld1 {v17.4h}, [x2], #8
> +        ld1 {v18.4h}, [x2], #8
> +        uxtl v17.4s, v17.4h
> +        ld1 {v19.4h}, [x2], #8
> +        uxtl v18.4s, v18.4h
> +        ld1 {v20.4h}, [x2], #8
> +        uxtl v19.4s, v19.4h
> +        ld1 {v22.4h}, [x2], #8
> +        uxtl v20.4s, v20.4h
> +        ld1 {v23.4h}, [x2]

This sequence of 6 separate sequential loads of .4h registers could be
expressed as two loads, with {.4h, .4h, .4h, .4h} and {.4h, .4h}, or
three loads of {.4h, .4h} - that should probably be more efficient than
doing 6 separate loads.

> +        uxtl v22.4s, v22.4h
> +        uxtl v23.4s, v23.4h
> +        add v17.4s, v17.4s, v18.4s
> +        add v16.4s, v16.4s, v17.4s

As this addition to v16 uses v17, which was updated in the directly
preceding instruction, this causes stalls on in-order cores. It'd be
better to move this add instruction down by one or two instructions.

> +        add v19.4s, v19.4s, v20.4s
> +        add v22.4s, v22.4s, v23.4s
> +        add v21.4s, v21.4s, v19.4s
> +        add v16.4s, v16.4s, v19.4s
> +        add v21.4s, v21.4s, v22.4s
> +
> +        sub x2, x2, #8
> +        add w6, w6, #1                      // i += 1

Instead of this register w6 counting up to w4, it'd be more idiomatic,
and require one register less, if you'd just decrement w4 instead, with
a "subs w4, w4, #1" instruction. Then you can just do "b.gt 6b" at the
end, rather than having to jump back to 6 to do the comparison there.

> +        add x2, x2, x3                      // grad += gstride - size * ALF_NUM_DIR

Instead of having to subtract 8 from x2 at the end of each loop, just
subtract 8 from x3 before the loop.
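
Untested, but with these suggestions combined, the summing loop could
end up roughly like this (assuming steps always is nonzero here; note
that with the last load not post-incrementing, neither the trailing
"sub x2, x2, #8" nor adjusting x3 is needed at all any longer):

        lsl     w3, w3, #1                  // gshift in bytes
        movi    v16.4s, #0
        movi    v21.4s, #0
6:
        ld1     {v17.4h, v18.4h, v19.4h, v20.4h}, [x2], #32
        ld1     {v22.4h, v23.4h}, [x2]
        uxtl    v17.4s, v17.4h
        uxtl    v18.4s, v18.4h
        uxtl    v19.4s, v19.4h
        uxtl    v20.4s, v20.4h
        uxtl    v22.4s, v22.4h
        uxtl    v23.4s, v23.4h
        add     v17.4s, v17.4s, v18.4s
        add     v19.4s, v19.4s, v20.4s
        add     v22.4s, v22.4s, v23.4s
        add     v16.4s, v16.4s, v17.4s
        add     v21.4s, v21.4s, v19.4s
        add     v16.4s, v16.4s, v19.4s
        add     v21.4s, v21.4s, v22.4s
        add     x2, x2, x3                  // skip to the next row of gradients
        subs    w4, w4, #1
        b.gt    6b

        st1     {v16.4s}, [x0]
        st1     {v21.4s}, [x1]
        ret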

> +        b 6b
> +60:
> +        st1 {v16.4s}, [x0]
> +        st1 {v21.4s}, [x1]
> +        ret
> +.endm
> +
> +
> +function ff_alf_classify_sum_neon, export=1
> +        ff_alf_classify_sum
> +endfunc
> +
> +function ff_alf_classify_grad_8_neon, export=1
> +        ff_alf_classify_grad 1
> +endfunc
> +
> +function ff_alf_classify_grad_10_neon, export=1
> +        ff_alf_classify_grad 2
> +endfunc
> +
> +function ff_alf_classify_grad_12_neon, export=1
> +        ff_alf_classify_grad 2
> +endfunc
> diff --git a/libavcodec/aarch64/vvc/alf_template.c b/libavcodec/aarch64/vvc/alf_template.c
> index 41f7bf8995..470222a634 100644
> --- a/libavcodec/aarch64/vvc/alf_template.c
> +++ b/libavcodec/aarch64/vvc/alf_template.c
> @@ -155,3 +155,90 @@ static void FUNC2(alf_filter_chroma, BIT_DEPTH, _neon)(uint8_t *_dst,
>         }
>     }
> }
> +
> +#define ALF_DIR_VERT 0
> +#define ALF_DIR_HORZ 1
> +#define ALF_DIR_DIGA0 2
> +#define ALF_DIR_DIGA1 3
> +
> +static void FUNC(ff_alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac)
> +{

Static functions don't need an ff_ prefix

> +    static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 };
> +
> +    int hv0, hv1, dir_hv, d0, d1, dir_d, hvd1, hvd0, sum_hv, dir1;
> +
> +    dir_hv = sum[ALF_DIR_VERT] <= sum[ALF_DIR_HORZ];
> +    hv1 = FFMAX(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
> +    hv0 = FFMIN(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
> +
> +    dir_d = sum[ALF_DIR_DIGA0] <= sum[ALF_DIR_DIGA1];
> +    d1 = FFMAX(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
> +    d0 = FFMIN(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
> +
> +    //promote to avoid overflow
> +    dir1 = (uint64_t)d1 * hv0 <= (uint64_t)hv1 * d0;
> +    hvd1 = dir1 ? hv1 : d1;
> +    hvd0 = dir1 ? hv0 : d0;
> +
> +    sum_hv = sum[ALF_DIR_HORZ] + sum[ALF_DIR_VERT];
> +    *class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)];
> +    if (hvd1 * 2 > 9 * hvd0)
> +        *class_idx += ((dir1 << 1) + 2) * 5;
> +    else if (hvd1 > 2 * hvd0)
> +        *class_idx += ((dir1 << 1) + 1) * 5;
> +
> +    *transpose_idx = dir_d * 2 + dir_hv;
> +}
> +
> +static void FUNC(ff_alf_classify)(int *class_idx, int *transpose_idx,

Ditto

> +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> +    const int vb_pos, int16_t *gradient_tmp)
> +{
> +    int16_t *grad;
> +
> +    const int w = width + ALF_GRADIENT_BORDER * 2;
> +    const int size = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
> +    const int gstride = (w / ALF_GRADIENT_STEP) * ALF_NUM_DIR;
> +    const int gshift = gstride - size * ALF_NUM_DIR;
> +
> +    for (int y = 0; y < height ; y += ALF_BLOCK_SIZE ) {
> +        int start = 0;
> +        int end = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
> +        int ac = 2;
> +        if (y + ALF_BLOCK_SIZE == vb_pos) {
> +            end -= ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
> +            ac = 3;
> +        } else if (y == vb_pos) {
> +            start += ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
> +            ac = 3;
> +        }
> +        for (int x = 0; x < width; x += (2*ALF_BLOCK_SIZE)) {
> +            const int xg = x / ALF_GRADIENT_STEP;
> +            const int yg = y / ALF_GRADIENT_STEP;
> +            int sum0[ALF_NUM_DIR];
> +            int sum1[ALF_NUM_DIR];
> +            grad = gradient_tmp + (yg + start) * gstride + xg * ALF_NUM_DIR;
> +                ff_alf_classify_sum_neon(sum0, sum1, grad, gshift, end-start);

This line seems to be indented differently than the rest

> +            FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum0, ac);
> +            class_idx++;
> +            transpose_idx++;
> +            FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum1, ac);
> +            class_idx++;
> +            transpose_idx++;
> +        }
> +    }
> +
> +}
> +
> +void FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
> +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> +    const int vb_pos, int16_t *gradient_tmp);
> +
> +
> +static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
> +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> +    const int vb_pos, int *gradient_tmp) {

Move the opening brace of a function to the next line

> +        FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);

This is indented one level too much. With the opening brace on its own
line, that would be clearer to see.

If you want to, you can sign up at https://code.ffmpeg.org and submit
your patch as a PR to https://code.ffmpeg.org/FFmpeg/FFmpeg, for easier
followup/reviewing of your submission.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".