* [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: optimised alf_classify function 8/10/12bit of vvc codec for aarch64
@ 2025-07-09 7:56 Georgii Zagoruiko
2025-08-01 17:50 ` Martin Storsjö
0 siblings, 1 reply; 2+ messages in thread
From: Georgii Zagoruiko @ 2025-07-09 7:56 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Georgii Zagoruiko
- vvc_alf.alf_classify [OK]
vvc_alf_classify_8x8_8_c: 1314.4 ( 1.00x)
vvc_alf_classify_8x8_8_neon: 794.3 ( 1.65x)
vvc_alf_classify_8x8_10_c: 1154.7 ( 1.00x)
vvc_alf_classify_8x8_10_neon: 770.0 ( 1.50x)
vvc_alf_classify_8x8_12_c: 1091.7 ( 1.00x)
vvc_alf_classify_8x8_12_neon: 770.7 ( 1.42x)
vvc_alf_classify_16x16_8_c: 3710.0 ( 1.00x)
vvc_alf_classify_16x16_8_neon: 2205.6 ( 1.68x)
vvc_alf_classify_16x16_10_c: 3306.2 ( 1.00x)
vvc_alf_classify_16x16_10_neon: 2087.9 ( 1.58x)
vvc_alf_classify_16x16_12_c: 3307.9 ( 1.00x)
vvc_alf_classify_16x16_12_neon: 2089.6 ( 1.58x)
vvc_alf_classify_32x32_8_c: 12770.2 ( 1.00x)
vvc_alf_classify_32x32_8_neon: 7124.6 ( 1.79x)
vvc_alf_classify_32x32_10_c: 11780.3 ( 1.00x)
vvc_alf_classify_32x32_10_neon: 6856.7 ( 1.72x)
vvc_alf_classify_32x32_12_c: 11779.2 ( 1.00x)
vvc_alf_classify_32x32_12_neon: 7002.8 ( 1.68x)
vvc_alf_classify_64x64_8_c: 49332.3 ( 1.00x)
vvc_alf_classify_64x64_8_neon: 26040.4 ( 1.89x)
vvc_alf_classify_64x64_10_c: 45353.7 ( 1.00x)
vvc_alf_classify_64x64_10_neon: 26251.5 ( 1.73x)
vvc_alf_classify_64x64_12_c: 44876.9 ( 1.00x)
vvc_alf_classify_64x64_12_neon: 26491.3 ( 1.69x)
vvc_alf_classify_128x128_8_c: 191953.5 ( 1.00x)
vvc_alf_classify_128x128_8_neon: 96166.3 ( 2.00x)
vvc_alf_classify_128x128_10_c: 177198.5 ( 1.00x)
vvc_alf_classify_128x128_10_neon: 96077.9 ( 1.84x)
vvc_alf_classify_128x128_12_c: 177461.1 ( 1.00x)
vvc_alf_classify_128x128_12_neon: 96184.4 ( 1.85x)
---
libavcodec/aarch64/vvc/alf.S | 278 ++++++++++++++++++++++++++
libavcodec/aarch64/vvc/alf_template.c | 87 ++++++++
libavcodec/aarch64/vvc/dsp_init.c | 6 +
3 files changed, 371 insertions(+)
diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S
index 8801b3afb6..9c9765ead1 100644
--- a/libavcodec/aarch64/vvc/alf.S
+++ b/libavcodec/aarch64/vvc/alf.S
@@ -291,3 +291,281 @@ function ff_alf_filter_chroma_kernel_10_neon, export=1
1:
alf_filter_chroma_kernel 2
endfunc
+
+
+
+.macro alf_classify_argvar2v30
+ mov w16, #0
+ mov v30.b[0], w16
+ mov w16, #1
+ mov v30.b[1], w16
+ mov w16, #2
+ mov v30.b[2], w16
+ mov v30.b[3], w16
+ mov v30.b[4], w16
+ mov v30.b[5], w16
+ mov v30.b[6], w16
+ mov w16, #3
+ mov v30.b[7], w16
+ mov v30.b[8], w16
+ mov v30.b[9], w16
+ mov v30.b[10], w16
+ mov v30.b[11], w16
+ mov v30.b[12], w16
+ mov v30.b[13], w16
+ mov v30.b[14], w16
+ mov w16, #4
+ mov v30.b[15], w16
+.endm
+
+.macro alf_classify_load_pixel pix_size, dst, src
+ .if \pix_size == 1
+ ldrb \dst, [\src], #1
+ .else
+ ldrh \dst, [\src], #2
+ .endif
+.endm
+
+.macro alf_classify_load_pixel_with_offset pix_size, dst, src, offset
+ .if \pix_size == 1
+ ldrb \dst, [\src, #(\offset)]
+ .else
+ ldrh \dst, [\src, #(2*\offset)]
+ .endif
+.endm
+
+#define ALF_BLOCK_SIZE 4
+#define ALF_GRADIENT_STEP 2
+#define ALF_GRADIENT_BORDER 2
+#define ALF_NUM_DIR 4
+#define ALF_GRAD_BORDER_X2 (ALF_GRADIENT_BORDER * 2)
+#define ALF_STRIDE_MUL (ALF_GRADIENT_BORDER + 1)
+#define ALF_GRAD_X_VSTEP (ALF_GRADIENT_STEP * 8)
+#define ALF_GSTRIDE_MUL (ALF_NUM_DIR / ALF_GRADIENT_STEP)
+
+// Shift right: equal to division by 2 (see ALF_GRADIENT_STEP)
+#define ALF_GSTRIDE_XG_BYTES (2 * ALF_NUM_DIR / ALF_GRADIENT_STEP)
+
+#define ALF_GSTRIDE_SUB_BYTES (2 * ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) * ALF_NUM_DIR)
+
+#define ALF_CLASS_INC (ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP)
+#define ALF_CLASS_END ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP)
+
+.macro ff_alf_classify_grad pix_size
+ // class_idx .req x0
+ // transpose_idx .req x1
+ // _src .req x2
+ // _src_stride .req x3
+ // width .req w4
+ // height .req w5
+ // vb_pos .req w6
+ // gradient_tmp .req x7
+
+ mov w16, #ALF_STRIDE_MUL
+ add w5, w5, #ALF_GRAD_BORDER_X2 // h = height + 4
+ mul x16, x3, x16 // 3 * stride
+ add w4, w4, #ALF_GRAD_BORDER_X2 // w = width + 4
+ sub x15, x2, x16 // src -= (3 * stride)
+ mov x17, x7
+ .if \pix_size == 1
+ sub x15, x15, #ALF_GRADIENT_BORDER
+ .else
+ sub x15, x15, #4
+ .endif
+ mov w8, #0 // y loop: y = 0
+1:
+ cmp w8, w5
+ bge 10f
+
+ add x16, x8, #1
+ mul x14, x8, x3 // y * stride
+ mul x16, x16, x3
+ add x10, x15, x14 // s0 = src + y * stride
+ add x14, x16, x3
+ add x11, x15, x16 // s1
+ add x16, x14, x3
+ add x12, x15, x14 // s2
+ add x13, x15, x16 // s3
+
+ // if (y == vb_pos): s3 = s2
+ cmp w8, w6
+ add w16, w6, #ALF_GRADIENT_BORDER
+ csel x13, x12, x13, eq
+ // if (y == vb_pos + 2): s0 = s1
+ cmp w8, w16
+ csel x10, x11, x10, eq
+
+ alf_classify_load_pixel_with_offset \pix_size, w16, x10, -1
+ alf_classify_load_pixel \pix_size, w14, x13
+ mov v16.h[7], w16
+ mov v28.h[7], w14
+
+ // load 4 pixels from *(s1-2) & *(s2-2)
+ .if \pix_size == 1
+ sub x11, x11, #2
+ sub x12, x12, #2
+ ld1 {v0.8b}, [x11]
+ ld1 {v1.8b}, [x12]
+ uxtl v17.8h, v0.8b
+ uxtl v18.8h, v1.8b
+ .else
+ sub x11, x11, #4
+ sub x12, x12, #4
+ ld1 {v17.8h}, [x11]
+ ld1 {v18.8h}, [x12]
+ .endif
+ ext v22.16b, v22.16b, v17.16b, #8
+ ext v24.16b, v24.16b, v18.16b, #8
+ .if \pix_size == 1
+ add x11, x11, #4
+ add x12, x12, #4
+ .else
+ add x11, x11, #8
+ add x12, x12, #8
+ .endif
+
+ // x loop
+ mov w9, #0
+ b 11f
+2:
+ cmp w9, w4
+ bge 20f
+
+ // The store starts from the second iteration of the loop
+ st2 {v4.8h, v5.8h}, [x17], #32
+11:
+ .if \pix_size == 1
+ // Load 8 pixels: s0 & s1+2
+ ld1 {v0.8b}, [x10], #8
+ ld1 {v1.8b}, [x11], #8
+ uxtl v20.8h, v0.8b
+ uxtl v26.8h, v1.8b
+ // Load 8 pixels: s2+2 & s3+1
+ ld1 {v0.8b}, [x12], #8
+ ld1 {v1.8b}, [x13], #8
+ uxtl v27.8h, v0.8b
+ uxtl v19.8h, v1.8b
+ .else
+ // Load 8 pixels: s0
+ ld1 {v20.8h}, [x10], #16
+ // Load 8 pixels: s1+2
+ ld1 {v26.8h}, [x11], #16
+ // Load 8 pixels: s2+2
+ ld1 {v27.8h}, [x12], #16
+ // Load 8 pixels: s3+1
+ ld1 {v19.8h}, [x13], #16
+ .endif
+
+ ext v16.16b, v16.16b, v20.16b, #14
+ ext v28.16b, v28.16b, v19.16b, #14
+
+ ext v17.16b, v22.16b, v26.16b, #8
+ ext v22.16b, v22.16b, v26.16b, #12
+
+ ext v18.16b, v24.16b, v27.16b, #8
+ ext v24.16b, v24.16b, v27.16b, #12
+
+ // Grad: Vertical & D0 (interleaved)
+ trn1 v21.8h, v20.8h, v16.8h // first abs: operand 1
+ rev32 v23.8h, v22.8h // second abs: operand 1
+ trn2 v29.8h, v28.8h, v19.8h // second abs: operand 2
+ trn1 v30.8h, v22.8h, v22.8h
+ trn2 v31.8h, v24.8h, v24.8h
+ add v30.8h, v30.8h, v30.8h
+ add v31.8h, v31.8h, v31.8h
+ sub v0.8h, v30.8h, v21.8h
+ sub v1.8h, v31.8h, v23.8h
+ sabd v4.8h, v0.8h, v24.8h
+
+ // Grad: Horizontal & D1 (interleaved)
+ trn2 v21.8h, v17.8h, v20.8h // first abs: operand 1
+ saba v4.8h, v1.8h, v29.8h
+ trn2 v23.8h, v22.8h, v18.8h // first abs: operand 2
+ trn1 v25.8h, v24.8h, v26.8h // second abs: operand 1
+ trn1 v29.8h, v27.8h, v28.8h // second abs: operand 2
+ sub v0.8h, v30.8h, v21.8h
+ sub v1.8h, v31.8h, v25.8h
+ sabd v5.8h, v0.8h, v23.8h
+
+ // Prepare for the next iteration:
+ mov v16.16b, v20.16b
+ saba v5.8h, v1.8h, v29.8h
+ mov v28.16b, v19.16b
+ mov v22.16b, v26.16b
+ mov v24.16b, v27.16b
+
+ add w9, w9, #8 // x += 8
+ b 2b
+20:
+ // 8 pixels -> 4 iterations of the generic code
+ // 4 pixels are padding => only half needs to be stored
+ st2 {v4.4h, v5.4h}, [x17], #16
+
+ add w8, w8, #ALF_GRADIENT_STEP // y += 2
+ b 1b
+10:
+ ret
+.endm
+
+.macro ff_alf_classify_sum
+ // sum0 .req x0
+ // sum1 .req x1
+ // grad .req x2
+ // gshift .req w3
+ // steps .req w4
+ mov w5, #2
+ mov w6, #0
+ mul w3, w3, w5
+ movi v16.4s, #0
+ movi v21.4s, #0
+6:
+ prfm pldl1keep, [x2]
+ cmp w6, w4
+ bge 60f
+
+ ld1 {v17.4h}, [x2], #8
+ ld1 {v18.4h}, [x2], #8
+ uxtl v17.4s, v17.4h
+ ld1 {v19.4h}, [x2], #8
+ uxtl v18.4s, v18.4h
+ ld1 {v20.4h}, [x2], #8
+ uxtl v19.4s, v19.4h
+ ld1 {v22.4h}, [x2], #8
+ uxtl v20.4s, v20.4h
+ ld1 {v23.4h}, [x2]
+ uxtl v22.4s, v22.4h
+ uxtl v23.4s, v23.4h
+ add v17.4s, v17.4s, v18.4s
+ add v16.4s, v16.4s, v17.4s
+ add v19.4s, v19.4s, v20.4s
+ add v22.4s, v22.4s, v23.4s
+ add v21.4s, v21.4s, v19.4s
+ add v16.4s, v16.4s, v19.4s
+ add v21.4s, v21.4s, v22.4s
+
+ sub x2, x2, #8
+ add w6, w6, #1 // i += 1
+ add x2, x2, x3 // grad += gstride - size * ALF_NUM_DIR
+ b 6b
+60:
+ st1 {v16.4s}, [x0]
+ st1 {v21.4s}, [x1]
+ ret
+.endm
+
+
+function ff_alf_classify_sum_neon, export=1
+ ff_alf_classify_sum
+endfunc
+
+function ff_alf_classify_grad_8_neon, export=1
+ ff_alf_classify_grad 1
+endfunc
+
+function ff_alf_classify_grad_10_neon, export=1
+ ff_alf_classify_grad 2
+endfunc
+
+function ff_alf_classify_grad_12_neon, export=1
+ ff_alf_classify_grad 2
+endfunc
diff --git a/libavcodec/aarch64/vvc/alf_template.c b/libavcodec/aarch64/vvc/alf_template.c
index 41f7bf8995..470222a634 100644
--- a/libavcodec/aarch64/vvc/alf_template.c
+++ b/libavcodec/aarch64/vvc/alf_template.c
@@ -155,3 +155,90 @@ static void FUNC2(alf_filter_chroma, BIT_DEPTH, _neon)(uint8_t *_dst,
}
}
}
+
+#define ALF_DIR_VERT 0
+#define ALF_DIR_HORZ 1
+#define ALF_DIR_DIGA0 2
+#define ALF_DIR_DIGA1 3
+
+static void FUNC(ff_alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac)
+{
+ static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 };
+
+ int hv0, hv1, dir_hv, d0, d1, dir_d, hvd1, hvd0, sum_hv, dir1;
+
+ dir_hv = sum[ALF_DIR_VERT] <= sum[ALF_DIR_HORZ];
+ hv1 = FFMAX(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
+ hv0 = FFMIN(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
+
+ dir_d = sum[ALF_DIR_DIGA0] <= sum[ALF_DIR_DIGA1];
+ d1 = FFMAX(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
+ d0 = FFMIN(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
+
+ //promote to avoid overflow
+ dir1 = (uint64_t)d1 * hv0 <= (uint64_t)hv1 * d0;
+ hvd1 = dir1 ? hv1 : d1;
+ hvd0 = dir1 ? hv0 : d0;
+
+ sum_hv = sum[ALF_DIR_HORZ] + sum[ALF_DIR_VERT];
+ *class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)];
+ if (hvd1 * 2 > 9 * hvd0)
+ *class_idx += ((dir1 << 1) + 2) * 5;
+ else if (hvd1 > 2 * hvd0)
+ *class_idx += ((dir1 << 1) + 1) * 5;
+
+ *transpose_idx = dir_d * 2 + dir_hv;
+}
+
+static void FUNC(ff_alf_classify)(int *class_idx, int *transpose_idx,
+ const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
+ const int vb_pos, int16_t *gradient_tmp)
+{
+ int16_t *grad;
+
+ const int w = width + ALF_GRADIENT_BORDER * 2;
+ const int size = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
+ const int gstride = (w / ALF_GRADIENT_STEP) * ALF_NUM_DIR;
+ const int gshift = gstride - size * ALF_NUM_DIR;
+
+ for (int y = 0; y < height ; y += ALF_BLOCK_SIZE ) {
+ int start = 0;
+ int end = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
+ int ac = 2;
+ if (y + ALF_BLOCK_SIZE == vb_pos) {
+ end -= ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
+ ac = 3;
+ } else if (y == vb_pos) {
+ start += ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
+ ac = 3;
+ }
+ for (int x = 0; x < width; x += (2*ALF_BLOCK_SIZE)) {
+ const int xg = x / ALF_GRADIENT_STEP;
+ const int yg = y / ALF_GRADIENT_STEP;
+ int sum0[ALF_NUM_DIR];
+ int sum1[ALF_NUM_DIR];
+ grad = gradient_tmp + (yg + start) * gstride + xg * ALF_NUM_DIR;
+ ff_alf_classify_sum_neon(sum0, sum1, grad, gshift, end-start);
+ FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum0, ac);
+ class_idx++;
+ transpose_idx++;
+ FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum1, ac);
+ class_idx++;
+ transpose_idx++;
+ }
+ }
+
+}
+
+void FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
+ const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
+ const int vb_pos, int16_t *gradient_tmp);
+
+
+static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
+ const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
+ const int vb_pos, int *gradient_tmp) {
+ FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
+ FUNC(ff_alf_classify)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
+}
+
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 9a171234f6..75360d33f3 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -42,6 +42,9 @@ void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0, const int16_t *_src1,
const int16_t **gradient_h,
const int16_t **gradient_v,
int16_t *vx, int16_t *vy);
+
+void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
+
#define BIT_DEPTH 8
#include "alf_template.c"
#include "of_template.c"
@@ -205,6 +208,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->sao.edge_filter[i] = ff_vvc_sao_edge_filter_16x16_8_neon;
c->alf.filter[LUMA] = alf_filter_luma_8_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_8_neon;
+ c->alf.classify = alf_classify_8_neon;
if (have_i8mm(cpu_flags)) {
c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon_i8mm;
@@ -243,6 +247,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
+ c->alf.classify = alf_classify_10_neon;
} else if (bd == 12) {
c->inter.avg = ff_vvc_avg_12_neon;
c->inter.w_avg = vvc_w_avg_12;
@@ -252,6 +257,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
+ c->alf.classify = alf_classify_12_neon;
}
c->inter.sad = ff_vvc_sad_neon;
--
2.34.1
* Re: [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: optimised alf_classify function 8/10/12bit of vvc codec for aarch64
2025-07-09 7:56 [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: optimised alf_classify function 8/10/12bit of vvc codec for aarch64 Georgii Zagoruiko
@ 2025-08-01 17:50 ` Martin Storsjö
0 siblings, 0 replies; 2+ messages in thread
From: Martin Storsjö @ 2025-08-01 17:50 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Georgii Zagoruiko
On Wed, 9 Jul 2025, Georgii Zagoruiko wrote:
> diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S
> index 8801b3afb6..9c9765ead1 100644
> --- a/libavcodec/aarch64/vvc/alf.S
> +++ b/libavcodec/aarch64/vvc/alf.S
> @@ -291,3 +291,281 @@ function ff_alf_filter_chroma_kernel_10_neon, export=1
> 1:
> alf_filter_chroma_kernel 2
> endfunc
> +
> +
> +
> +.macro alf_classify_argvar2v30
> + mov w16, #0
> + mov v30.b[0], w16
> + mov w16, #1
> + mov v30.b[1], w16
> + mov w16, #2
> + mov v30.b[2], w16
> + mov v30.b[3], w16
> + mov v30.b[4], w16
> + mov v30.b[5], w16
> + mov v30.b[6], w16
> + mov w16, #3
> + mov v30.b[7], w16
> + mov v30.b[8], w16
> + mov v30.b[9], w16
> + mov v30.b[10], w16
> + mov v30.b[11], w16
> + mov v30.b[12], w16
> + mov v30.b[13], w16
> + mov v30.b[14], w16
> + mov w16, #4
> + mov v30.b[15], w16
Wouldn't it be more efficient to just have this pattern stored as a
constant somewhere, and load it with a single load instruction (with some
latency), rather than using a 19-instruction sequence to initialize it?
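Something like this, roughly (untested sketch; the label name is made up,
assuming the const/endconst and movrel helpers from
libavutil/aarch64/asm.S):

const alf_classify_arg_var, align=4
        .byte   0, 1, 2, 2, 2, 2, 2, 3
        .byte   3, 3, 3, 3, 3, 3, 3, 4
endconst

        movrel  x16, alf_classify_arg_var
        ld1     {v30.16b}, [x16]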
> +.endm
> +
> +.macro alf_classify_load_pixel pix_size, dst, src
> + .if \pix_size == 1
> + ldrb \dst, [\src], #1
> + .else
> + ldrh \dst, [\src], #2
> + .endif
> +.endm
> +
> +.macro alf_classify_load_pixel_with_offset pix_size, dst, src, offset
> + .if \pix_size == 1
> + ldrb \dst, [\src, #(\offset)]
> + .else
> + ldrh \dst, [\src, #(2*\offset)]
> + .endif
> +.endm
> +
> +#define ALF_BLOCK_SIZE 4
> +#define ALF_GRADIENT_STEP 2
> +#define ALF_GRADIENT_BORDER 2
> +#define ALF_NUM_DIR 4
> +#define ALF_GRAD_BORDER_X2 (ALF_GRADIENT_BORDER * 2)
> +#define ALF_STRIDE_MUL (ALF_GRADIENT_BORDER + 1)
> +#define ALF_GRAD_X_VSTEP (ALF_GRADIENT_STEP * 8)
> +#define ALF_GSTRIDE_MUL (ALF_NUM_DIR / ALF_GRADIENT_STEP)
> +
> +// Shift right: equal to division by 2 (see ALF_GRADIENT_STEP)
> +#define ALF_GSTRIDE_XG_BYTES (2 * ALF_NUM_DIR / ALF_GRADIENT_STEP)
> +
> +#define ALF_GSTRIDE_SUB_BYTES (2 * ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) * ALF_NUM_DIR)
> +
> +#define ALF_CLASS_INC (ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP)
> +#define ALF_CLASS_END ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP)
> +
> +.macro ff_alf_classify_grad pix_size
> + // class_idx .req x0
> + // transpose_idx .req x1
> + // _src .req x2
> + // _src_stride .req x3
> + // width .req w4
> + // height .req w5
> + // vb_pos .req w6
> + // gradient_tmp .req x7
> +
> + mov w16, #ALF_STRIDE_MUL
> + add w5, w5, #ALF_GRAD_BORDER_X2 // h = height + 4
> + mul x16, x3, x16 // 3 * stride
> + add w4, w4, #ALF_GRAD_BORDER_X2 // w = width + 4
> + sub x15, x2, x16 // src -= (3 * stride)
> + mov x17, x7
> + .if \pix_size == 1
> + sub x15, x15, #ALF_GRADIENT_BORDER
> + .else
> + sub x15, x15, #4
> + .endif
> + mov w8, #0 // y loop: y = 0
> +1:
> + cmp w8, w5
> + bge 10f
> +
> + add x16, x8, #1
> + mul x14, x8, x3 // y * stride
> + mul x16, x16, x3
> + add x10, x15, x14 // s0 = src + y * stride
> + add x14, x16, x3
> + add x11, x15, x16 // s1
> + add x16, x14, x3
> + add x12, x15, x14 // s2
> + add x13, x15, x16 // s3
> +
> + // if (y == vb_pos): s3 = s2
> + cmp w8, w6
> + add w16, w6, #ALF_GRADIENT_BORDER
> + csel x13, x12, x13, eq
> + // if (y == vb_pos + 2): s0 = s1
> + cmp w8, w16
> + csel x10, x11, x10, eq
> +
> + alf_classify_load_pixel_with_offset \pix_size, w16, x10, -1
> + alf_classify_load_pixel \pix_size, w14, x13
> + mov v16.h[7], w16
> + mov v28.h[7], w14
> +
> + // load 4 pixels from *(s1-2) & *(s2-2)
> + .if \pix_size == 1
> + sub x11, x11, #2
> + sub x12, x12, #2
> + ld1 {v0.8b}, [x11]
> + ld1 {v1.8b}, [x12]
> + uxtl v17.8h, v0.8b
> + uxtl v18.8h, v1.8b
> + .else
> + sub x11, x11, #4
> + sub x12, x12, #4
> + ld1 {v17.8h}, [x11]
> + ld1 {v18.8h}, [x12]
> + .endif
> + ext v22.16b, v22.16b, v17.16b, #8
> + ext v24.16b, v24.16b, v18.16b, #8
> + .if \pix_size == 1
> + add x11, x11, #4
> + add x12, x12, #4
> + .else
> + add x11, x11, #8
> + add x12, x12, #8
> + .endif
> +
> + // x loop
> + mov w9, #0
> + b 11f
> +2:
> + cmp w9, w4
> + bge 20f
Instead of having w9 count up to w4, it'd be more common to just decrement
a single register. Here you could do "mov w9, w4" instead of "mov w9, #0",
and decrement w9 with "subs w9, w9, #8" at the end, which allows you to do
the branch as "b.gt 2b", avoiding some extra jumping around at the end of
the loop. Same for the outer loop for w8/w5.
> +
> + // The store starts from the second iteration of the loop
> + st2 {v4.8h, v5.8h}, [x17], #32
> +11:
> + .if \pix_size == 1
> + // Load 8 pixels: s0 & s1+2
> + ld1 {v0.8b}, [x10], #8
> + ld1 {v1.8b}, [x11], #8
> + uxtl v20.8h, v0.8b
> + uxtl v26.8h, v1.8b
> + // Load 8 pixels: s2+2 & s3+1
> + ld1 {v0.8b}, [x12], #8
> + ld1 {v1.8b}, [x13], #8
> + uxtl v27.8h, v0.8b
> + uxtl v19.8h, v1.8b
> + .else
> + // Load 8 pixels: s0
> + ld1 {v20.8h}, [x10], #16
> + // Load 8 pixels: s1+2
> + ld1 {v26.8h}, [x11], #16
> + // Load 8 pixels: s2+2
> + ld1 {v27.8h}, [x12], #16
> + // Load 8 pixels: s3+1
> + ld1 {v19.8h}, [x13], #16
> + .endif
> +
> + ext v16.16b, v16.16b, v20.16b, #14
> + ext v28.16b, v28.16b, v19.16b, #14
> +
> + ext v17.16b, v22.16b, v26.16b, #8
> + ext v22.16b, v22.16b, v26.16b, #12
> +
> + ext v18.16b, v24.16b, v27.16b, #8
> + ext v24.16b, v24.16b, v27.16b, #12
> +
> + // Grad: Vertical & D0 (interleaved)
> + trn1 v21.8h, v20.8h, v16.8h // first abs: operand 1
> + rev32 v23.8h, v22.8h // second abs: operand 1
> + trn2 v29.8h, v28.8h, v19.8h // second abs: operand 2
> + trn1 v30.8h, v22.8h, v22.8h
> + trn2 v31.8h, v24.8h, v24.8h
> + add v30.8h, v30.8h, v30.8h
> + add v31.8h, v31.8h, v31.8h
> + sub v0.8h, v30.8h, v21.8h
> + sub v1.8h, v31.8h, v23.8h
> + sabd v4.8h, v0.8h, v24.8h
> +
> + // Grad: Horizontal & D1 (interleaved)
> + trn2 v21.8h, v17.8h, v20.8h // first abs: operand 1
> + saba v4.8h, v1.8h, v29.8h
> + trn2 v23.8h, v22.8h, v18.8h // first abs: operand 2
> + trn1 v25.8h, v24.8h, v26.8h // second abs: operand 1
> + trn1 v29.8h, v27.8h, v28.8h // second abs: operand 2
> + sub v0.8h, v30.8h, v21.8h
> + sub v1.8h, v31.8h, v25.8h
> + sabd v5.8h, v0.8h, v23.8h
> +
> + // Prepare for the next iteration:
> + mov v16.16b, v20.16b
> + saba v5.8h, v1.8h, v29.8h
> + mov v28.16b, v19.16b
> + mov v22.16b, v26.16b
> + mov v24.16b, v27.16b
> +
> + add w9, w9, #8 // x += 8
> + b 2b
> +20:
> + // 8 pixels -> 4 iterations of the generic code
> + // 4 pixels are padding => only half needs to be stored
> + st2 {v4.4h, v5.4h}, [x17], #16
> +
> + add w8, w8, #ALF_GRADIENT_STEP // y += 2
> + b 1b
> +10:
> + ret
> +.endm
> +
> +.macro ff_alf_classify_sum
This doesn't seem to need to be a macro, as it is only invoked once,
without any parameters?
> + // sum0 .req x0
> + // sum1 .req x1
> + // grad .req x2
> + // gshift .req w3
> + // steps .req w4
> + mov w5, #2
> + mov w6, #0
> + mul w3, w3, w5
If you want to multiply by 2, then just do "lsl w3, w3, #1".
> + movi v16.4s, #0
> + movi v21.4s, #0
> +6:
> + prfm pldl1keep, [x2]
No prefetch instructions please. If you feel strongly about it, send a
separate patch afterwards to add the prefetch instructions, with
repeatable benchmark numbers.
> + cmp w6, w4
> + bge 60f
> +
> + ld1 {v17.4h}, [x2], #8
> + ld1 {v18.4h}, [x2], #8
> + uxtl v17.4s, v17.4h
> + ld1 {v19.4h}, [x2], #8
> + uxtl v18.4s, v18.4h
> + ld1 {v20.4h}, [x2], #8
> + uxtl v19.4s, v19.4h
> + ld1 {v22.4h}, [x2], #8
> + uxtl v20.4s, v20.4h
> + ld1 {v23.4h}, [x2]
This sequence of 6 separate sequential loads of .4h registers could be
expressed as two loads, with {.4h, .4h, .4h, .4h} and {.4h, .4h}, or three
loads of {.4h, .4h} - that should probably be more efficient than doing 6
separate loads.
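E.g. (untested):

        ld1     {v17.4h, v18.4h, v19.4h, v20.4h}, [x2], #32
        ld1     {v22.4h, v23.4h}, [x2]

This also leaves x2 where the current "sub x2, x2, #8" would put it, so
that instruction could be dropped as well.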
> + uxtl v22.4s, v22.4h
> + uxtl v23.4s, v23.4h
> + add v17.4s, v17.4s, v18.4s
> + add v16.4s, v16.4s, v17.4s
As this addition to v16 uses v17, which was updated in the directly
preceding instruction, this causes stalls on in-order cores. It'd be
better to move this add instruction down by one or two instructions.
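For example, the same sums with that accumulation moved down (untested):

        add     v17.4s, v17.4s, v18.4s
        add     v19.4s, v19.4s, v20.4s
        add     v22.4s, v22.4s, v23.4s
        add     v16.4s, v16.4s, v17.4s  // v17 is a few instructions old here
        add     v21.4s, v21.4s, v19.4s
        add     v16.4s, v16.4s, v19.4s
        add     v21.4s, v21.4s, v22.4s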
> + add v19.4s, v19.4s, v20.4s
> + add v22.4s, v22.4s, v23.4s
> + add v21.4s, v21.4s, v19.4s
> + add v16.4s, v16.4s, v19.4s
> + add v21.4s, v21.4s, v22.4s
> +
> + sub x2, x2, #8
> + add w6, w6, #1 // i += 1
Instead of having this register w6 count up to w4, it'd be more idiomatic,
and require one register less, to just decrement w4 instead, with a
"subs w4, w4, #1" instruction. Then you can just do "b.gt 6b" at the end,
rather than having to jump back to 6 to do the comparison there.
> + add x2, x2, x3 // grad += gstride - size * ALF_NUM_DIR
Instead of having to subtract 8 from x2 at the end of each loop, just
subtract 8 from x3 before the loop.
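Taken together with the counting-down suggestion above, the tail of the
loop could look roughly like this (untested sketch):

        lsl     w3, w3, #1
        sub     x3, x3, #8      // fold the trailing "sub x2, x2, #8" into the stride
6:
        // ... loads and additions as above ...
        add     x2, x2, x3      // grad += gstride - size * ALF_NUM_DIR
        subs    w4, w4, #1      // steps--
        b.gt    6b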
> + b 6b
> +60:
> + st1 {v16.4s}, [x0]
> + st1 {v21.4s}, [x1]
> + ret
> +.endm
> +
> +
> +function ff_alf_classify_sum_neon, export=1
> + ff_alf_classify_sum
> +endfunc
> +
> +function ff_alf_classify_grad_8_neon, export=1
> + ff_alf_classify_grad 1
> +endfunc
> +
> +function ff_alf_classify_grad_10_neon, export=1
> + ff_alf_classify_grad 2
> +endfunc
> +
> +function ff_alf_classify_grad_12_neon, export=1
> + ff_alf_classify_grad 2
> +endfunc
> diff --git a/libavcodec/aarch64/vvc/alf_template.c b/libavcodec/aarch64/vvc/alf_template.c
> index 41f7bf8995..470222a634 100644
> --- a/libavcodec/aarch64/vvc/alf_template.c
> +++ b/libavcodec/aarch64/vvc/alf_template.c
> @@ -155,3 +155,90 @@ static void FUNC2(alf_filter_chroma, BIT_DEPTH, _neon)(uint8_t *_dst,
> }
> }
> }
> +
> +#define ALF_DIR_VERT 0
> +#define ALF_DIR_HORZ 1
> +#define ALF_DIR_DIGA0 2
> +#define ALF_DIR_DIGA1 3
> +
> +static void FUNC(ff_alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac)
> +{
Static functions don't need an ff_ prefix
> + static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 };
> +
> + int hv0, hv1, dir_hv, d0, d1, dir_d, hvd1, hvd0, sum_hv, dir1;
> +
> + dir_hv = sum[ALF_DIR_VERT] <= sum[ALF_DIR_HORZ];
> + hv1 = FFMAX(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
> + hv0 = FFMIN(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
> +
> + dir_d = sum[ALF_DIR_DIGA0] <= sum[ALF_DIR_DIGA1];
> + d1 = FFMAX(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
> + d0 = FFMIN(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
> +
> + //promote to avoid overflow
> + dir1 = (uint64_t)d1 * hv0 <= (uint64_t)hv1 * d0;
> + hvd1 = dir1 ? hv1 : d1;
> + hvd0 = dir1 ? hv0 : d0;
> +
> + sum_hv = sum[ALF_DIR_HORZ] + sum[ALF_DIR_VERT];
> + *class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)];
> + if (hvd1 * 2 > 9 * hvd0)
> + *class_idx += ((dir1 << 1) + 2) * 5;
> + else if (hvd1 > 2 * hvd0)
> + *class_idx += ((dir1 << 1) + 1) * 5;
> +
> + *transpose_idx = dir_d * 2 + dir_hv;
> +}
> +
> +static void FUNC(ff_alf_classify)(int *class_idx, int *transpose_idx,
Ditto
> + const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> + const int vb_pos, int16_t *gradient_tmp)
> +{
> + int16_t *grad;
> +
> + const int w = width + ALF_GRADIENT_BORDER * 2;
> + const int size = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
> + const int gstride = (w / ALF_GRADIENT_STEP) * ALF_NUM_DIR;
> + const int gshift = gstride - size * ALF_NUM_DIR;
> +
> + for (int y = 0; y < height ; y += ALF_BLOCK_SIZE ) {
> + int start = 0;
> + int end = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
> + int ac = 2;
> + if (y + ALF_BLOCK_SIZE == vb_pos) {
> + end -= ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
> + ac = 3;
> + } else if (y == vb_pos) {
> + start += ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
> + ac = 3;
> + }
> + for (int x = 0; x < width; x += (2*ALF_BLOCK_SIZE)) {
> + const int xg = x / ALF_GRADIENT_STEP;
> + const int yg = y / ALF_GRADIENT_STEP;
> + int sum0[ALF_NUM_DIR];
> + int sum1[ALF_NUM_DIR];
> + grad = gradient_tmp + (yg + start) * gstride + xg * ALF_NUM_DIR;
> + ff_alf_classify_sum_neon(sum0, sum1, grad, gshift, end-start);
This line seems to be indented differently than the rest
> + FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum0, ac);
> + class_idx++;
> + transpose_idx++;
> + FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum1, ac);
> + class_idx++;
> + transpose_idx++;
> + }
> + }
> +
> +}
> +
> +void FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
> + const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> + const int vb_pos, int16_t *gradient_tmp);
> +
> +
> +static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
> + const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> + const int vb_pos, int *gradient_tmp) {
Move the opening brace of a function to the next line
> + FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
This is indented one level too much. With the opening brace on its own
line, that would be clearer to see.
If you want to, you can sign up at https://code.ffmpeg.org and submit your
patch as a PR to https://code.ffmpeg.org/FFmpeg/FFmpeg, for easier
followup/reviewing of your submission.
// Martin