From: Georgii Zagoruiko <george.zaguri-at-gmail.com@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: Georgii Zagoruiko <george.zaguri@gmail.com> Subject: [FFmpeg-devel] [PATCH] Optimised alf_classify function 8/10/12bit of vvc codec for aarch64 Date: Wed, 9 Jul 2025 00:27:30 +0400 Message-ID: <20250708202730.99401-1-george.zaguri@gmail.com> (raw) --- libavcodec/aarch64/vvc/alf.S | 278 ++++++++++++++++++++++++++ libavcodec/aarch64/vvc/alf_template.c | 87 ++++++++ libavcodec/aarch64/vvc/dsp_init.c | 6 + 3 files changed, 371 insertions(+) diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S index 8801b3afb6..9c9765ead1 100644 --- a/libavcodec/aarch64/vvc/alf.S +++ b/libavcodec/aarch64/vvc/alf.S @@ -291,3 +291,281 @@ function ff_alf_filter_chroma_kernel_10_neon, export=1 1: alf_filter_chroma_kernel 2 endfunc + + + +.macro alf_classify_argvar2v30 + mov w16, #0 + mov v30.b[0], w16 + mov w16, #1 + mov v30.b[1], w16 + mov w16, #2 + mov v30.b[2], w16 + mov v30.b[3], w16 + mov v30.b[4], w16 + mov v30.b[5], w16 + mov v30.b[6], w16 + mov w16, #3 + mov v30.b[7], w16 + mov v30.b[8], w16 + mov v30.b[9], w16 + mov v30.b[10], w16 + mov v30.b[11], w16 + mov v30.b[12], w16 + mov v30.b[13], w16 + mov v30.b[14], w16 + mov w16, #4 + mov v30.b[15], w16 +.endm + +.macro alf_classify_load_pixel pix_size, dst, src + .if \pix_size == 1 + ldrb \dst, [\src], #1 + .else + ldrh \dst, [\src], #2 + .endif +.endm + +.macro alf_classify_load_pixel_with_offset pix_size, dst, src, offset + .if \pix_size == 1 + ldrb \dst, [\src, #(\offset)] + .else + ldrh \dst, [\src, #(2*\offset)] + .endif +.endm + +#define ALF_BLOCK_SIZE 4 +#define ALF_GRADIENT_STEP 2 +#define ALF_GRADIENT_BORDER 2 +#define ALF_NUM_DIR 4 +#define ALF_GRAD_BORDER_X2 (ALF_GRADIENT_BORDER * 2) +#define ALF_STRIDE_MUL (ALF_GRADIENT_BORDER + 1) +#define ALF_GRAD_X_VSTEP (ALF_GRADIENT_STEP * 8) +#define ALF_GSTRIDE_MUL (ALF_NUM_DIR / ALF_GRADIENT_STEP) + +// Shift right: equal to division by 2 (see ALF_GRADIENT_STEP) +#define ALF_GSTRIDE_XG_BYTES (2 * ALF_NUM_DIR / ALF_GRADIENT_STEP) + +#define ALF_GSTRIDE_SUB_BYTES (2 * ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) * ALF_NUM_DIR) + +#define ALF_CLASS_INC (ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP) +#define ALF_CLASS_END ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) + +.macro ff_alf_classify_grad pix_size + // class_idx .req x0 + // transpose_idx .req x1 + // _src .req x2 + // _src_stride .req x3 + // width .req w4 + // height .req w5 + // vb_pos .req w6 + // gradient_tmp .req x7 + + mov w16, #ALF_STRIDE_MUL + add w5, w5, #ALF_GRAD_BORDER_X2 // h = height + 4 + mul x16, x3, x16 // 3 * stride + add w4, w4, #ALF_GRAD_BORDER_X2 // w = width + 4 + sub x15, x2, x16 // src -= (3 * stride) + mov x17, x7 + .if \pix_size == 1 + sub x15, x15, #ALF_GRADIENT_BORDER + .else + sub x15, x15, #4 + .endif + mov w8, #0 // y loop: y = 0 +1: + cmp w8, w5 + bge 10f + + add x16, x8, #1 + mul x14, x8, x3 // y * stride + mul x16, x16, x3 + add x10, x15, x14 // s0 = src + y * stride + add x14, x16, x3 + add x11, x15, x16 // s1 + add x16, x14, x3 + add x12, x15, x14 // s2 + add x13, x15, x16 // s3 + + // if (y == vb_pos): s3 = s2 + cmp w8, w6 + add w16, w6, #ALF_GRADIENT_BORDER + csel x13, x12, x13, eq + // if (y == vb_pos + 2): s0 = s1 + cmp w8, w16 + csel x10, x11, x10, eq + + alf_classify_load_pixel_with_offset \pix_size, w16, x10, -1 + alf_classify_load_pixel \pix_size, w14, x13 + mov v16.h[7], w16 + mov v28.h[7], w14 + + // load 4 pixels from *(s1-2) & *(s2-2) + .if \pix_size == 1 + sub x11, x11, #2 + sub x12, x12, #2 + ld1 {v0.8b}, [x11] + ld1 {v1.8b}, [x12] + uxtl v17.8h, v0.8b + uxtl v18.8h, v1.8b + .else + sub x11, x11, #4 + sub x12, x12, #4 + ld1 {v17.8h}, [x11] + ld1 {v18.8h}, [x12] + .endif + ext v22.16b, v22.16b, v17.16b, #8 + ext v24.16b, v24.16b, v18.16b, #8 + .if \pix_size == 1 + add x11, x11, #4 + add x12, x12, #4 + .else + add x11, x11, #8 + add x12, x12, #8 + .endif + + // x loop + mov w9, #0 + b 11f +2: + cmp w9, w4 + bge 20f + + // Store operation starts from the second cycle + st2 {v4.8h, v5.8h}, [x17], #32 +11: + .if \pix_size == 1 + // Load 8 pixels: s0 & s1+2 + ld1 {v0.8b}, [x10], #8 + ld1 {v1.8b}, [x11], #8 + uxtl v20.8h, v0.8b + uxtl v26.8h, v1.8b + // Load 8 pixels: s2+2 & s3+1 + ld1 {v0.8b}, [x12], #8 + ld1 {v1.8b}, [x13], #8 + uxtl v27.8h, v0.8b + uxtl v19.8h, v1.8b + .else + // Load 8 pixels: s0 + ld1 {v20.8h}, [x10], #16 + // Load 8 pixels: s1+2 + ld1 {v26.8h}, [x11], #16 + // Load 8 pixels: s2+2 + ld1 {v27.8h}, [x12], #16 + // Load 8 pixels: s3+1 + ld1 {v19.8h}, [x13], #16 + .endif + + ext v16.16b, v16.16b, v20.16b, #14 + ext v28.16b, v28.16b, v19.16b, #14 + + ext v17.16b, v22.16b, v26.16b, #8 + ext v22.16b, v22.16b, v26.16b, #12 + + ext v18.16b, v24.16b, v27.16b, #8 + ext v24.16b, v24.16b, v27.16b, #12 + + // Grad: Vertical & D0 (interleaved) + trn1 v21.8h, v20.8h, v16.8h // first abs: operand 1 + rev32 v23.8h, v22.8h // second abs: operand 1 + trn2 v29.8h, v28.8h, v19.8h // second abs: operand 2 + trn1 v30.8h, v22.8h, v22.8h + trn2 v31.8h, v24.8h, v24.8h + add v30.8h, v30.8h, v30.8h + add v31.8h, v31.8h, v31.8h + sub v0.8h, v30.8h, v21.8h + sub v1.8h, v31.8h, v23.8h + sabd v4.8h, v0.8h, v24.8h + + // Grad: Horizontal & D1 (interleaved) + trn2 v21.8h, v17.8h, v20.8h // first abs: operand 1 + saba v4.8h, v1.8h, v29.8h + trn2 v23.8h, v22.8h, v18.8h // first abs: operand 2 + trn1 v25.8h, v24.8h, v26.8h // second abs: operand 1 + trn1 v29.8h, v27.8h, v28.8h // second abs: operand 2 + sub v0.8h, v30.8h, v21.8h + sub v1.8h, v31.8h, v25.8h + sabd v5.8h, v0.8h, v23.8h + + // Prepare for the next interation: + mov v16.16b, v20.16b + saba v5.8h, v1.8h, v29.8h + mov v28.16b, v19.16b + mov v22.16b, v26.16b + mov v24.16b, v27.16b + + add w9, w9, #8 // x += 8 + b 2b +20: + // 8 pixels -> 4 cycles of generic + // 4 pixels -> paddings => half needs to be saved + st2 {v4.4h, v5.4h}, [x17], #16 + + add w8, w8, #ALF_GRADIENT_STEP // y += 2 + b 1b +10: + ret +.endm + +.macro ff_alf_classify_sum + // sum0 .req x0 + // sum1 .req x1 + // grad .req x2 + // gshift .req w3 + // steps .req w4 + mov w5, #2 + mov w6, #0 + mul w3, w3, w5 + movi v16.4s, #0 + movi v21.4s, #0 +6: + prfm pldl1keep, [x2] + cmp w6, w4 + bge 60f + + ld1 {v17.4h}, [x2], #8 + ld1 {v18.4h}, [x2], #8 + uxtl v17.4s, v17.4h + ld1 {v19.4h}, [x2], #8 + uxtl v18.4s, v18.4h + ld1 {v20.4h}, [x2], #8 + uxtl v19.4s, v19.4h + ld1 {v22.4h}, [x2], #8 + uxtl v20.4s, v20.4h + ld1 {v23.4h}, [x2] + uxtl v22.4s, v22.4h + uxtl v23.4s, v23.4h + add v17.4s, v17.4s, v18.4s + add v16.4s, v16.4s, v17.4s + add v19.4s, v19.4s, v20.4s + add v22.4s, v22.4s, v23.4s + add v21.4s, v21.4s, v19.4s + add v16.4s, v16.4s, v19.4s + add v21.4s, v21.4s, v22.4s + + sub x2, x2, #8 + add w6, w6, #1 // i += 1 + add x2, x2, x3 // grad += gstride - size * ALF_NUM_DIR + b 6b +60: + st1 {v16.4s}, [x0] + st1 {v21.4s}, [x1] + ret +.endm + + +function ff_alf_classify_sum_neon, export=1 + ff_alf_classify_sum +endfunc + +function ff_alf_classify_grad_8_neon, export=1 + ff_alf_classify_grad 1 +endfunc + +function ff_alf_classify_grad_10_neon, export=1 + ff_alf_classify_grad 2 +endfunc + +function ff_alf_classify_grad_12_neon, export=1 + ff_alf_classify_grad 2 +endfunc diff --git a/libavcodec/aarch64/vvc/alf_template.c b/libavcodec/aarch64/vvc/alf_template.c index 41f7bf8995..470222a634 100644 --- a/libavcodec/aarch64/vvc/alf_template.c +++ b/libavcodec/aarch64/vvc/alf_template.c @@ -155,3 +155,90 @@ static void FUNC2(alf_filter_chroma, BIT_DEPTH, _neon)(uint8_t *_dst, } } } + +#define ALF_DIR_VERT 0 +#define ALF_DIR_HORZ 1 +#define ALF_DIR_DIGA0 2 +#define ALF_DIR_DIGA1 3 + +static void FUNC(ff_alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac) +{ + static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; + + int hv0, hv1, dir_hv, d0, d1, dir_d, hvd1, hvd0, sum_hv, dir1; + + dir_hv = sum[ALF_DIR_VERT] <= sum[ALF_DIR_HORZ]; + hv1 = FFMAX(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]); + hv0 = FFMIN(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]); + + dir_d = sum[ALF_DIR_DIGA0] <= sum[ALF_DIR_DIGA1]; + d1 = FFMAX(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]); + d0 = FFMIN(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]); + + //promote to avoid overflow + dir1 = (uint64_t)d1 * hv0 <= (uint64_t)hv1 * d0; + hvd1 = dir1 ? hv1 : d1; + hvd0 = dir1 ? hv0 : d0; + + sum_hv = sum[ALF_DIR_HORZ] + sum[ALF_DIR_VERT]; + *class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)]; + if (hvd1 * 2 > 9 * hvd0) + *class_idx += ((dir1 << 1) + 2) * 5; + else if (hvd1 > 2 * hvd0) + *class_idx += ((dir1 << 1) + 1) * 5; + + *transpose_idx = dir_d * 2 + dir_hv; +} + +static void FUNC(ff_alf_classify)(int *class_idx, int *transpose_idx, + const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height, + const int vb_pos, int16_t *gradient_tmp) +{ + int16_t *grad; + + const int w = width + ALF_GRADIENT_BORDER * 2; + const int size = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP; + const int gstride = (w / ALF_GRADIENT_STEP) * ALF_NUM_DIR; + const int gshift = gstride - size * ALF_NUM_DIR; + + for (int y = 0; y < height ; y += ALF_BLOCK_SIZE ) { + int start = 0; + int end = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP; + int ac = 2; + if (y + ALF_BLOCK_SIZE == vb_pos) { + end -= ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP; + ac = 3; + } else if (y == vb_pos) { + start += ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP; + ac = 3; + } + for (int x = 0; x < width; x += (2*ALF_BLOCK_SIZE)) { + const int xg = x / ALF_GRADIENT_STEP; + const int yg = y / ALF_GRADIENT_STEP; + int sum0[ALF_NUM_DIR]; + int sum1[ALF_NUM_DIR]; + grad = gradient_tmp + (yg + start) * gstride + xg * ALF_NUM_DIR; + ff_alf_classify_sum_neon(sum0, sum1, grad, gshift, end-start); + FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum0, ac); + class_idx++; + transpose_idx++; + FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum1, ac); + class_idx++; + transpose_idx++; + } + } + +} + +void FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx, + const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height, + const int vb_pos, int16_t *gradient_tmp); + + +static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx, + const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height, + const int vb_pos, int *gradient_tmp) { + FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp); + FUNC(ff_alf_classify)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp); +} + diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c index 9a171234f6..75360d33f3 100644 --- a/libavcodec/aarch64/vvc/dsp_init.c +++ b/libavcodec/aarch64/vvc/dsp_init.c @@ -42,6 +42,9 @@ void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0, const int16_t *_src1, const int16_t **gradient_h, const int16_t **gradient_v, int16_t *vx, int16_t *vy); + +void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps); + #define BIT_DEPTH 8 #include "alf_template.c" #include "of_template.c" @@ -205,6 +208,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->sao.edge_filter[i] = ff_vvc_sao_edge_filter_16x16_8_neon; c->alf.filter[LUMA] = alf_filter_luma_8_neon; c->alf.filter[CHROMA] = alf_filter_chroma_8_neon; + c->alf.classify = alf_classify_8_neon; if (have_i8mm(cpu_flags)) { c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon_i8mm; @@ -243,6 +247,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->alf.filter[LUMA] = alf_filter_luma_10_neon; c->alf.filter[CHROMA] = alf_filter_chroma_10_neon; + c->alf.classify = alf_classify_10_neon; } else if (bd == 12) { c->inter.avg = ff_vvc_avg_12_neon; c->inter.w_avg = vvc_w_avg_12; @@ -252,6 +257,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->alf.filter[LUMA] = alf_filter_luma_12_neon; c->alf.filter[CHROMA] = alf_filter_chroma_12_neon; + c->alf.classify = alf_classify_12_neon; } c->inter.sad = ff_vvc_sad_neon; -- 2.34.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
reply other threads:[~2025-07-08 20:27 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20250708202730.99401-1-george.zaguri@gmail.com \ --to=george.zaguri-at-gmail.com@ffmpeg.org \ --cc=ffmpeg-devel@ffmpeg.org \ --cc=george.zaguri@gmail.com \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git