* [FFmpeg-devel] [PATCH] feature/classify_neon (PR #20377)
From: george.zaguri via ffmpeg-devel @ 2025-08-31 9:55 UTC
To: ffmpeg-devel; +Cc: george.zaguri
PR #20377 opened by george.zaguri
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20377
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20377.patch
NEON optimisations for the VVC alf_classify functions, with fixes that improve performance on Apple silicon and corrected comments in the patch.
Apple M2 (MacBook Air):
vvc_alf_classify_8x8_8_c: 2.6 ( 1.00x)
vvc_alf_classify_8x8_8_neon: 1.2 ( 2.06x)
vvc_alf_classify_8x8_10_c: 2.7 ( 1.00x)
vvc_alf_classify_8x8_10_neon: 1.1 ( 2.41x)
vvc_alf_classify_8x8_12_c: 2.8 ( 1.00x)
vvc_alf_classify_8x8_12_neon: 1.1 ( 2.48x)
vvc_alf_classify_16x16_8_c: 7.2 ( 1.00x)
vvc_alf_classify_16x16_8_neon: 3.4 ( 2.09x)
vvc_alf_classify_16x16_10_c: 4.3 ( 1.00x)
vvc_alf_classify_16x16_10_neon: 3.1 ( 1.38x)
vvc_alf_classify_16x16_12_c: 4.4 ( 1.00x)
vvc_alf_classify_16x16_12_neon: 3.2 ( 1.40x)
vvc_alf_classify_32x32_8_c: 13.6 ( 1.00x)
vvc_alf_classify_32x32_8_neon: 10.6 ( 1.29x)
vvc_alf_classify_32x32_10_c: 12.1 ( 1.00x)
vvc_alf_classify_32x32_10_neon: 9.6 ( 1.26x)
vvc_alf_classify_32x32_12_c: 12.3 ( 1.00x)
vvc_alf_classify_32x32_12_neon: 9.6 ( 1.28x)
vvc_alf_classify_64x64_8_c: 44.0 ( 1.00x)
vvc_alf_classify_64x64_8_neon: 38.6 ( 1.14x)
vvc_alf_classify_64x64_10_c: 41.0 ( 1.00x)
vvc_alf_classify_64x64_10_neon: 35.0 ( 1.17x)
vvc_alf_classify_64x64_12_c: 41.7 ( 1.00x)
vvc_alf_classify_64x64_12_neon: 34.9 ( 1.20x)
vvc_alf_classify_128x128_8_c: 157.8 ( 1.00x)
vvc_alf_classify_128x128_8_neon: 147.2 ( 1.07x)
vvc_alf_classify_128x128_10_c: 150.4 ( 1.00x)
vvc_alf_classify_128x128_10_neon: 131.6 ( 1.14x)
vvc_alf_classify_128x128_12_c: 150.0 ( 1.00x)
vvc_alf_classify_128x128_12_neon: 130.6 ( 1.15x)
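These are checkasm --bench timings (lower is better; the bracketed factor is the speed-up over the C reference). Assuming the standard FFmpeg harness, a run along these lines should reproduce them (flags as in the current checkasm tool; adjust if your tree differs):

    make tests/checkasm/checkasm
    ./tests/checkasm/checkasm --test=vvc_alf --bench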
From 8b279086db3eb4d1c680be706756f57ca926e0b2 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <george.zaguri@gmail.com>
Date: Tue, 8 Jul 2025 23:52:18 +0400
Subject: [PATCH 1/3] avcodec/aarch64/vvc: optimise the alf_classify functions
 for 8/10/12-bit VVC on aarch64
- vvc_alf.alf_classify [OK]
vvc_alf_classify_8x8_8_c: 1314.4 ( 1.00x)
vvc_alf_classify_8x8_8_neon: 794.3 ( 1.65x)
vvc_alf_classify_8x8_10_c: 1154.7 ( 1.00x)
vvc_alf_classify_8x8_10_neon: 770.0 ( 1.50x)
vvc_alf_classify_8x8_12_c: 1091.7 ( 1.00x)
vvc_alf_classify_8x8_12_neon: 770.7 ( 1.42x)
vvc_alf_classify_16x16_8_c: 3710.0 ( 1.00x)
vvc_alf_classify_16x16_8_neon: 2205.6 ( 1.68x)
vvc_alf_classify_16x16_10_c: 3306.2 ( 1.00x)
vvc_alf_classify_16x16_10_neon: 2087.9 ( 1.58x)
vvc_alf_classify_16x16_12_c: 3307.9 ( 1.00x)
vvc_alf_classify_16x16_12_neon: 2089.6 ( 1.58x)
vvc_alf_classify_32x32_8_c: 12770.2 ( 1.00x)
vvc_alf_classify_32x32_8_neon: 7124.6 ( 1.79x)
vvc_alf_classify_32x32_10_c: 11780.3 ( 1.00x)
vvc_alf_classify_32x32_10_neon: 6856.7 ( 1.72x)
vvc_alf_classify_32x32_12_c: 11779.2 ( 1.00x)
vvc_alf_classify_32x32_12_neon: 7002.8 ( 1.68x)
vvc_alf_classify_64x64_8_c: 49332.3 ( 1.00x)
vvc_alf_classify_64x64_8_neon: 26040.4 ( 1.89x)
vvc_alf_classify_64x64_10_c: 45353.7 ( 1.00x)
vvc_alf_classify_64x64_10_neon: 26251.5 ( 1.73x)
vvc_alf_classify_64x64_12_c: 44876.9 ( 1.00x)
vvc_alf_classify_64x64_12_neon: 26491.3 ( 1.69x)
vvc_alf_classify_128x128_8_c: 191953.5 ( 1.00x)
vvc_alf_classify_128x128_8_neon: 96166.3 ( 2.00x)
vvc_alf_classify_128x128_10_c: 177198.5 ( 1.00x)
vvc_alf_classify_128x128_10_neon: 96077.9 ( 1.84x)
vvc_alf_classify_128x128_12_c: 177461.1 ( 1.00x)
vvc_alf_classify_128x128_12_neon: 96184.4 ( 1.85x)
---
libavcodec/aarch64/vvc/alf.S | 278 ++++++++++++++++++++++++++
libavcodec/aarch64/vvc/alf_template.c | 87 ++++++++
libavcodec/aarch64/vvc/dsp_init.c | 6 +
3 files changed, 371 insertions(+)
diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S
index 8801b3afb6..9c9765ead1 100644
--- a/libavcodec/aarch64/vvc/alf.S
+++ b/libavcodec/aarch64/vvc/alf.S
@@ -291,3 +291,281 @@ function ff_alf_filter_chroma_kernel_10_neon, export=1
1:
alf_filter_chroma_kernel 2
endfunc
+
+
+
+.macro alf_classify_argvar2v30
+ mov w16, #0
+ mov v30.b[0], w16
+ mov w16, #1
+ mov v30.b[1], w16
+ mov w16, #2
+ mov v30.b[2], w16
+ mov v30.b[3], w16
+ mov v30.b[4], w16
+ mov v30.b[5], w16
+ mov v30.b[6], w16
+ mov w16, #3
+ mov v30.b[7], w16
+ mov v30.b[8], w16
+ mov v30.b[9], w16
+ mov v30.b[10], w16
+ mov v30.b[11], w16
+ mov v30.b[12], w16
+ mov v30.b[13], w16
+ mov v30.b[14], w16
+ mov w16, #4
+ mov v30.b[15], w16
+.endm
+
+.macro alf_classify_load_pixel pix_size, dst, src
+ .if \pix_size == 1
+ ldrb \dst, [\src], #1
+ .else
+ ldrh \dst, [\src], #2
+ .endif
+.endm
+
+.macro alf_classify_load_pixel_with_offset pix_size, dst, src, offset
+ .if \pix_size == 1
+ ldrb \dst, [\src, #(\offset)]
+ .else
+ ldrh \dst, [\src, #(2*\offset)]
+ .endif
+.endm
+
+#define ALF_BLOCK_SIZE 4
+#define ALF_GRADIENT_STEP 2
+#define ALF_GRADIENT_BORDER 2
+#define ALF_NUM_DIR 4
+#define ALF_GRAD_BORDER_X2 (ALF_GRADIENT_BORDER * 2)
+#define ALF_STRIDE_MUL (ALF_GRADIENT_BORDER + 1)
+#define ALF_GRAD_X_VSTEP (ALF_GRADIENT_STEP * 8)
+#define ALF_GSTRIDE_MUL (ALF_NUM_DIR / ALF_GRADIENT_STEP)
+
+// The division by ALF_GRADIENT_STEP (2) reduces to a right shift
+#define ALF_GSTRIDE_XG_BYTES (2 * ALF_NUM_DIR / ALF_GRADIENT_STEP)
+
+#define ALF_GSTRIDE_SUB_BYTES (2 * ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) * ALF_NUM_DIR)
+
+#define ALF_CLASS_INC (ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP)
+#define ALF_CLASS_END ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP)
+
+.macro ff_alf_classify_grad pix_size
+ // class_idx .req x0
+ // transpose_idx .req x1
+ // _src .req x2
+ // _src_stride .req x3
+ // width .req w4
+ // height .req w5
+ // vb_pos .req w6
+ // gradient_tmp .req x7
+
+ mov w16, #ALF_STRIDE_MUL
+ add w5, w5, #ALF_GRAD_BORDER_X2 // h = height + 4
+ mul x16, x3, x16 // 3 * stride
+ add w4, w4, #ALF_GRAD_BORDER_X2 // w = width + 4
+ sub x15, x2, x16 // src -= (3 * stride)
+ mov x17, x7
+ .if \pix_size == 1
+ sub x15, x15, #ALF_GRADIENT_BORDER
+ .else
+ sub x15, x15, #4
+ .endif
+ mov w8, #0 // y loop: y = 0
+1:
+ cmp w8, w5
+ bge 10f
+
+ add x16, x8, #1
+ mul x14, x8, x3 // y * stride
+ mul x16, x16, x3
+ add x10, x15, x14 // s0 = src + y * stride
+ add x14, x16, x3
+ add x11, x15, x16 // s1
+ add x16, x14, x3
+ add x12, x15, x14 // s2
+ add x13, x15, x16 // s3
+
+ // if (y == vb_pos): s3 = s2
+ cmp w8, w6
+ add w16, w6, #ALF_GRADIENT_BORDER
+ csel x13, x12, x13, eq
+ // if (y == vb_pos + 2): s0 = s1
+ cmp w8, w16
+ csel x10, x11, x10, eq
+
+ alf_classify_load_pixel_with_offset \pix_size, w16, x10, -1
+ alf_classify_load_pixel \pix_size, w14, x13
+ mov v16.h[7], w16
+ mov v28.h[7], w14
+
+ // load 4 pixels from *(s1-2) & *(s2-2)
+ .if \pix_size == 1
+ sub x11, x11, #2
+ sub x12, x12, #2
+ ld1 {v0.8b}, [x11]
+ ld1 {v1.8b}, [x12]
+ uxtl v17.8h, v0.8b
+ uxtl v18.8h, v1.8b
+ .else
+ sub x11, x11, #4
+ sub x12, x12, #4
+ ld1 {v17.8h}, [x11]
+ ld1 {v18.8h}, [x12]
+ .endif
+ ext v22.16b, v22.16b, v17.16b, #8
+ ext v24.16b, v24.16b, v18.16b, #8
+ .if \pix_size == 1
+ add x11, x11, #4
+ add x12, x12, #4
+ .else
+ add x11, x11, #8
+ add x12, x12, #8
+ .endif
+
+ // x loop
+ mov w9, #0
+ b 11f
+2:
+ cmp w9, w4
+ bge 20f
+
+ // Stores start from the second iteration
+ st2 {v4.8h, v5.8h}, [x17], #32
+11:
+ .if \pix_size == 1
+ // Load 8 pixels: s0 & s1+2
+ ld1 {v0.8b}, [x10], #8
+ ld1 {v1.8b}, [x11], #8
+ uxtl v20.8h, v0.8b
+ uxtl v26.8h, v1.8b
+ // Load 8 pixels: s2+2 & s3+1
+ ld1 {v0.8b}, [x12], #8
+ ld1 {v1.8b}, [x13], #8
+ uxtl v27.8h, v0.8b
+ uxtl v19.8h, v1.8b
+ .else
+ // Load 8 pixels: s0
+ ld1 {v20.8h}, [x10], #16
+ // Load 8 pixels: s1+2
+ ld1 {v26.8h}, [x11], #16
+ // Load 8 pixels: s2+2
+ ld1 {v27.8h}, [x12], #16
+ // Load 8 pixels: s3+1
+ ld1 {v19.8h}, [x13], #16
+ .endif
+
+ ext v16.16b, v16.16b, v20.16b, #14
+ ext v28.16b, v28.16b, v19.16b, #14
+
+ ext v17.16b, v22.16b, v26.16b, #8
+ ext v22.16b, v22.16b, v26.16b, #12
+
+ ext v18.16b, v24.16b, v27.16b, #8
+ ext v24.16b, v24.16b, v27.16b, #12
+
+ // Grad: Vertical & D0 (interleaved)
+ trn1 v21.8h, v20.8h, v16.8h // first abs: operand 1
+ rev32 v23.8h, v22.8h // second abs: operand 1
+ trn2 v29.8h, v28.8h, v19.8h // second abs: operand 2
+ trn1 v30.8h, v22.8h, v22.8h
+ trn2 v31.8h, v24.8h, v24.8h
+ add v30.8h, v30.8h, v30.8h
+ add v31.8h, v31.8h, v31.8h
+ sub v0.8h, v30.8h, v21.8h
+ sub v1.8h, v31.8h, v23.8h
+ sabd v4.8h, v0.8h, v24.8h
+
+ // Grad: Horizontal & D1 (interleaved)
+ trn2 v21.8h, v17.8h, v20.8h // first abs: operand 1
+ saba v4.8h, v1.8h, v29.8h
+ trn2 v23.8h, v22.8h, v18.8h // first abs: operand 2
+ trn1 v25.8h, v24.8h, v26.8h // second abs: operand 1
+ trn1 v29.8h, v27.8h, v28.8h // second abs: operand 2
+ sub v0.8h, v30.8h, v21.8h
+ sub v1.8h, v31.8h, v25.8h
+ sabd v5.8h, v0.8h, v23.8h
+
+ // Prepare for the next iteration:
+ mov v16.16b, v20.16b
+ saba v5.8h, v1.8h, v29.8h
+ mov v28.16b, v19.16b
+ mov v22.16b, v26.16b
+ mov v24.16b, v27.16b
+
+ add w9, w9, #8 // x += 8
+ b 2b
+20:
+ // One 8-pixel pass equals 4 iterations of the generic code; the
+ // final pass has only 4 valid pixels, so store half of each register
+ st2 {v4.4h, v5.4h}, [x17], #16
+
+ add w8, w8, #ALF_GRADIENT_STEP // y += 2
+ b 1b
+10:
+ ret
+.endm
+
+.macro ff_alf_classify_sum
+ // sum0 .req x0
+ // sum1 .req x1
+ // grad .req x2
+ // gshift .req w3
+ // steps .req w4
+ mov w5, #2
+ mov w6, #0
+ mul w3, w3, w5
+ movi v16.4s, #0
+ movi v21.4s, #0
+6:
+ prfm pldl1keep, [x2]
+ cmp w6, w4
+ bge 60f
+
+ ld1 {v17.4h}, [x2], #8
+ ld1 {v18.4h}, [x2], #8
+ uxtl v17.4s, v17.4h
+ ld1 {v19.4h}, [x2], #8
+ uxtl v18.4s, v18.4h
+ ld1 {v20.4h}, [x2], #8
+ uxtl v19.4s, v19.4h
+ ld1 {v22.4h}, [x2], #8
+ uxtl v20.4s, v20.4h
+ ld1 {v23.4h}, [x2]
+ uxtl v22.4s, v22.4h
+ uxtl v23.4s, v23.4h
+ add v17.4s, v17.4s, v18.4s
+ add v16.4s, v16.4s, v17.4s
+ add v19.4s, v19.4s, v20.4s
+ add v22.4s, v22.4s, v23.4s
+ add v21.4s, v21.4s, v19.4s
+ add v16.4s, v16.4s, v19.4s
+ add v21.4s, v21.4s, v22.4s
+
+ sub x2, x2, #8
+ add w6, w6, #1 // i += 1
+ add x2, x2, x3 // grad += gstride - size * ALF_NUM_DIR
+ b 6b
+60:
+ st1 {v16.4s}, [x0]
+ st1 {v21.4s}, [x1]
+ ret
+.endm
+
+
+function ff_alf_classify_sum_neon, export=1
+ ff_alf_classify_sum
+endfunc
+
+function ff_alf_classify_grad_8_neon, export=1
+ ff_alf_classify_grad 1
+endfunc
+
+function ff_alf_classify_grad_10_neon, export=1
+ ff_alf_classify_grad 2
+endfunc
+
+function ff_alf_classify_grad_12_neon, export=1
+ ff_alf_classify_grad 2
+endfunc
diff --git a/libavcodec/aarch64/vvc/alf_template.c b/libavcodec/aarch64/vvc/alf_template.c
index 41f7bf8995..470222a634 100644
--- a/libavcodec/aarch64/vvc/alf_template.c
+++ b/libavcodec/aarch64/vvc/alf_template.c
@@ -155,3 +155,90 @@ static void FUNC2(alf_filter_chroma, BIT_DEPTH, _neon)(uint8_t *_dst,
}
}
}
+
+#define ALF_DIR_VERT 0
+#define ALF_DIR_HORZ 1
+#define ALF_DIR_DIGA0 2
+#define ALF_DIR_DIGA1 3
+
+static void FUNC(ff_alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac)
+{
+ static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 };
+
+ int hv0, hv1, dir_hv, d0, d1, dir_d, hvd1, hvd0, sum_hv, dir1;
+
+ dir_hv = sum[ALF_DIR_VERT] <= sum[ALF_DIR_HORZ];
+ hv1 = FFMAX(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
+ hv0 = FFMIN(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
+
+ dir_d = sum[ALF_DIR_DIGA0] <= sum[ALF_DIR_DIGA1];
+ d1 = FFMAX(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
+ d0 = FFMIN(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
+
+ // promote to 64-bit to avoid overflow
+ dir1 = (uint64_t)d1 * hv0 <= (uint64_t)hv1 * d0;
+ hvd1 = dir1 ? hv1 : d1;
+ hvd0 = dir1 ? hv0 : d0;
+
+ sum_hv = sum[ALF_DIR_HORZ] + sum[ALF_DIR_VERT];
+ *class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)];
+ if (hvd1 * 2 > 9 * hvd0)
+ *class_idx += ((dir1 << 1) + 2) * 5;
+ else if (hvd1 > 2 * hvd0)
+ *class_idx += ((dir1 << 1) + 1) * 5;
+
+ *transpose_idx = dir_d * 2 + dir_hv;
+}
+
+static void FUNC(ff_alf_classify)(int *class_idx, int *transpose_idx,
+ const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
+ const int vb_pos, int16_t *gradient_tmp)
+{
+ int16_t *grad;
+
+ const int w = width + ALF_GRADIENT_BORDER * 2;
+ const int size = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
+ const int gstride = (w / ALF_GRADIENT_STEP) * ALF_NUM_DIR;
+ const int gshift = gstride - size * ALF_NUM_DIR;
+
+ for (int y = 0; y < height; y += ALF_BLOCK_SIZE) {
+ int start = 0;
+ int end = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
+ int ac = 2;
+ if (y + ALF_BLOCK_SIZE == vb_pos) {
+ end -= ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
+ ac = 3;
+ } else if (y == vb_pos) {
+ start += ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
+ ac = 3;
+ }
+ for (int x = 0; x < width; x += (2*ALF_BLOCK_SIZE)) {
+ const int xg = x / ALF_GRADIENT_STEP;
+ const int yg = y / ALF_GRADIENT_STEP;
+ int sum0[ALF_NUM_DIR];
+ int sum1[ALF_NUM_DIR];
+ grad = gradient_tmp + (yg + start) * gstride + xg * ALF_NUM_DIR;
+ ff_alf_classify_sum_neon(sum0, sum1, grad, gshift, end-start);
+ FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum0, ac);
+ class_idx++;
+ transpose_idx++;
+ FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum1, ac);
+ class_idx++;
+ transpose_idx++;
+ }
+ }
+
+}
+
+void FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
+ const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
+ const int vb_pos, int16_t *gradient_tmp);
+
+
+static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
+ const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
+ const int vb_pos, int *gradient_tmp) {
+ FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
+ FUNC(ff_alf_classify)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
+}
+
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 9a171234f6..75360d33f3 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -42,6 +42,9 @@ void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0, const int16_t *_src1,
const int16_t **gradient_h,
const int16_t **gradient_v,
int16_t *vx, int16_t *vy);
+
+void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
+
#define BIT_DEPTH 8
#include "alf_template.c"
#include "of_template.c"
@@ -205,6 +208,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->sao.edge_filter[i] = ff_vvc_sao_edge_filter_16x16_8_neon;
c->alf.filter[LUMA] = alf_filter_luma_8_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_8_neon;
+ c->alf.classify = alf_classify_8_neon;
if (have_i8mm(cpu_flags)) {
c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon_i8mm;
@@ -243,6 +247,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
+ c->alf.classify = alf_classify_10_neon;
} else if (bd == 12) {
c->inter.avg = ff_vvc_avg_12_neon;
c->inter.w_avg = vvc_w_avg_12;
@@ -252,6 +257,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
+ c->alf.classify = alf_classify_12_neon;
}
c->inter.sad = ff_vvc_sad_neon;
--
2.49.1
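For reviewers, a rough scalar model of the per-point gradients that the ff_alf_classify_grad macro above vectorises, following the VVC ALF definition: gradients live on a checkerboard, and each gradient point at even (x, y) sums the sample at (x, y) with the one at (x + 1, y + 1). The helper below is illustrative only (the authoritative C reference is the existing code in libavcodec/vvc); pixel is the bit-depth-dependent sample type, FFABS comes from libavutil/common.h, and s0..s3 are the rows y-1 .. y+2, matching the registers loaded above:

    /* Illustrative only: one gradient point at even (x, y) */
    static inline void alf_grad_point(const pixel *s0, const pixel *s1,
                                      const pixel *s2, const pixel *s3,
                                      const int x, int16_t g[4])
    {
        g[0] = FFABS(2 * s1[x]     - s0[x]     - s2[x])      /* vertical   */
             + FFABS(2 * s2[x + 1] - s1[x + 1] - s3[x + 1]);
        g[1] = FFABS(2 * s1[x]     - s1[x - 1] - s1[x + 1])  /* horizontal */
             + FFABS(2 * s2[x + 1] - s2[x]     - s2[x + 2]);
        g[2] = FFABS(2 * s1[x]     - s0[x - 1] - s2[x + 1])  /* diagonal 0 */
             + FFABS(2 * s2[x + 1] - s1[x]     - s3[x + 2]);
        g[3] = FFABS(2 * s1[x]     - s0[x + 1] - s2[x - 1])  /* diagonal 1 */
             + FFABS(2 * s2[x + 1] - s1[x + 2] - s3[x]);
    }

In the NEON code v4 accumulates the vertical/d0 results and v5 the horizontal/d1 ones, with the two directions interleaved per lane; that is consistent with st2 {v4, v5} writing the vert, horz, d0, d1 memory order that ff_alf_classify_sum_neon and the C side expect.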
From bde4ea825c648eb4c3a517f8b7775a93b4104ae3 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <george.zaguri@gmail.com>
Date: Tue, 15 Jul 2025 22:01:27 +0400
Subject: [PATCH 2/3] avcodec/aarch64/vvc: alf_classify version 2, without ext instructions
---
libavcodec/aarch64/vvc/alf.S | 111 ++++++++++++++++++-----------------
1 file changed, 58 insertions(+), 53 deletions(-)
diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S
index 9c9765ead1..c137e35e72 100644
--- a/libavcodec/aarch64/vvc/alf.S
+++ b/libavcodec/aarch64/vvc/alf.S
@@ -395,33 +395,14 @@ endfunc
cmp w8, w16
csel x10, x11, x10, eq
- alf_classify_load_pixel_with_offset \pix_size, w16, x10, -1
- alf_classify_load_pixel \pix_size, w14, x13
- mov v16.h[7], w16
- mov v28.h[7], w14
-
- // load 4 pixels from *(s1-2) & *(s2-2)
.if \pix_size == 1
+ sub x10, x10, #1 // s0-1
sub x11, x11, #2
sub x12, x12, #2
- ld1 {v0.8b}, [x11]
- ld1 {v1.8b}, [x12]
- uxtl v17.8h, v0.8b
- uxtl v18.8h, v1.8b
.else
+ sub x10, x10, #2 // s0-1
sub x11, x11, #4
sub x12, x12, #4
- ld1 {v17.8h}, [x11]
- ld1 {v18.8h}, [x12]
- .endif
- ext v22.16b, v22.16b, v17.16b, #8
- ext v24.16b, v24.16b, v18.16b, #8
- .if \pix_size == 1
- add x11, x11, #4
- add x12, x12, #4
- .else
- add x11, x11, #8
- add x12, x12, #8
.endif
// x loop
@@ -436,35 +417,65 @@ endfunc
11:
.if \pix_size == 1
// Load 8 pixels: s0 & s1+2
- ld1 {v0.8b}, [x10], #8
- ld1 {v1.8b}, [x11], #8
- uxtl v20.8h, v0.8b
- uxtl v26.8h, v1.8b
- // Load 8 pixels: s2+2 & s3+1
- ld1 {v0.8b}, [x12], #8
- ld1 {v1.8b}, [x13], #8
- uxtl v27.8h, v0.8b
+ ld1 {v0.8b}, [x10] // s0-1
+ add x10, x10, #1
+ ld1 {v1.8b}, [x10] // s0
+ add x10, x10, #7
+ uxtl v16.8h, v0.8b
+ uxtl v20.8h, v1.8b
+
+ ld1 {v0.8b}, [x11] // s1-2
+ add x11, x11, #2
+ ld1 {v1.8b}, [x11] // s1
+ add x11, x11, #2
+ ld1 {v2.8b}, [x11] // s1+2
+ add x11, x11, #4
+ uxtl v17.8h, v0.8b
+ uxtl v22.8h, v1.8b
+ uxtl v26.8h, v2.8b
+
+ ld1 {v0.8b}, [x12] // s2-2
+ add x12, x12, #2
+ ld1 {v1.8b}, [x12] // s2
+ add x12, x12, #2
+ ld1 {v2.8b}, [x12] // s2+2
+ add x12, x12, #4
+ uxtl v18.8h, v0.8b
+ uxtl v24.8h, v1.8b
+ uxtl v27.8h, v2.8b
+
+ ld1 {v0.8b}, [x13] // s3
+ add x13, x13, #1
+ ld1 {v1.8b}, [x13] // s3+1
+ add x13, x13, #7
+ uxtl v28.8h, v0.8b
uxtl v19.8h, v1.8b
.else
- // Load 8 pixels: s0
- ld1 {v20.8h}, [x10], #16
- // Load 8 pixels: s1+2
- ld1 {v26.8h}, [x11], #16
- // Load 8 pixels: s2+2
- ld1 {v27.8h}, [x12], #16
- // Load 8 pixels: s3+1
- ld1 {v19.8h}, [x13], #16
+ ld1 {v16.8h}, [x10] // s0-1
+ add x10, x10, #2
+ ld1 {v20.8h}, [x10] // s0
+ add x10, x10, #14
+
+ ld1 {v17.8h}, [x11] // s1-2
+ add x11, x11, #4
+ ld1 {v22.8h}, [x11] // s1
+ add x11, x11, #4
+ ld1 {v26.8h}, [x11] // s1+2
+ add x11, x11, #8
+
+ ld1 {v18.8h}, [x12] // s2-2
+ add x12, x12, #4
+ ld1 {v24.8h}, [x12] // s2
+ add x12, x12, #4
+ ld1 {v27.8h}, [x12] // s2+2
+ add x12, x12, #8
+
+ ld1 {v28.8h}, [x13] // s3
+ add x13, x13, #2
+ ld1 {v19.8h}, [x13] // s3+1
+ add x13, x13, #14
.endif
- ext v16.16b, v16.16b, v20.16b, #14
- ext v28.16b, v28.16b, v19.16b, #14
-
- ext v17.16b, v22.16b, v26.16b, #8
- ext v22.16b, v22.16b, v26.16b, #12
-
- ext v18.16b, v24.16b, v27.16b, #8
- ext v24.16b, v24.16b, v27.16b, #12
-
// Grad: Vertical & D0 (interleaved)
trn1 v21.8h, v20.8h, v16.8h // first abs: operand 1
rev32 v23.8h, v22.8h // second abs: operand 1
@@ -486,14 +497,8 @@ endfunc
sub v0.8h, v30.8h, v21.8h
sub v1.8h, v31.8h, v25.8h
sabd v5.8h, v0.8h, v23.8h
-
- // Prepare for the next iteration:
- mov v16.16b, v20.16b
+ mov v16.16b, v20.16b // filler to hide the sabd->saba latency
saba v5.8h, v1.8h, v29.8h
- mov v28.16b, v19.16b
- mov v22.16b, v26.16b
- mov v24.16b, v27.16b
-
add w9, w9, #8 // x += 8
b 2b
20:
--
2.49.1
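The gist of this revision: v1 kept the previous iteration's vectors and built the shifted s+-2/s+-1 windows with ext, which creates a serial dependency chain through v22/v24 across iterations; v2 simply reloads each window from memory at the right offset, so the loads are independent and the cross-iteration state disappears. Schematically, for the s1 row in the 10/12-bit path (lines taken from the two patches):

    v1:
        ld1 {v26.8h}, [x11], #16            // s1+2
        ext v17.16b, v22.16b, v26.16b, #8   // s1-2 stitched from old/new
        ext v22.16b, v22.16b, v26.16b, #12  // s1   stitched from old/new

    v2:
        ld1 {v17.8h}, [x11]                 // s1-2
        add x11, x11, #4
        ld1 {v22.8h}, [x11]                 // s1
        add x11, x11, #4
        ld1 {v26.8h}, [x11]                 // s1+2
        add x11, x11, #8

The overlapping reloads hit L1 and can issue in parallel, which is presumably where the Apple M2 gains come from.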
From c0aa20b932b8ee07779e98223bb903290c7efdea Mon Sep 17 00:00:00 2001
From: Viktoriia Zagoruiko <viktoriabubnova2018@gmail.com>
Date: Sat, 30 Aug 2025 18:25:44 +0100
Subject: [PATCH 3/3] avcodec/aarch64/vvc: fix comments and an optimisation
 issue in the sum part of the classification
---
libavcodec/aarch64/vvc/alf.S | 201 +++++++++-----------------
libavcodec/aarch64/vvc/alf_template.c | 15 +-
2 files changed, 73 insertions(+), 143 deletions(-)
diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S
index c137e35e72..e546274567 100644
--- a/libavcodec/aarch64/vvc/alf.S
+++ b/libavcodec/aarch64/vvc/alf.S
@@ -292,48 +292,6 @@ function ff_alf_filter_chroma_kernel_10_neon, export=1
alf_filter_chroma_kernel 2
endfunc
-
-
-.macro alf_classify_argvar2v30
- mov w16, #0
- mov v30.b[0], w16
- mov w16, #1
- mov v30.b[1], w16
- mov w16, #2
- mov v30.b[2], w16
- mov v30.b[3], w16
- mov v30.b[4], w16
- mov v30.b[5], w16
- mov v30.b[6], w16
- mov w16, #3
- mov v30.b[7], w16
- mov v30.b[8], w16
- mov v30.b[9], w16
- mov v30.b[10], w16
- mov v30.b[11], w16
- mov v30.b[12], w16
- mov v30.b[13], w16
- mov v30.b[14], w16
- mov w16, #4
- mov v30.b[15], w16
-.endm
-
-.macro alf_classify_load_pixel pix_size, dst, src
- .if \pix_size == 1
- ldrb \dst, [\src], #1
- .else
- ldrh \dst, [\src], #2
- .endif
-.endm
-
-.macro alf_classify_load_pixel_with_offset pix_size, dst, src, offset
- .if \pix_size == 1
- ldrb \dst, [\src, #(\offset)]
- .else
- ldrh \dst, [\src, #(2*\offset)]
- .endif
-.endm
-
#define ALF_BLOCK_SIZE 4
#define ALF_GRADIENT_STEP 2
#define ALF_GRADIENT_BORDER 2
@@ -374,9 +332,6 @@ endfunc
.endif
mov w8, #0 // y loop: y = 0
1:
- cmp w8, w5
- bge 10f
-
add x16, x8, #1
mul x14, x8, x3 // y * stride
mul x16, x16, x3
@@ -409,71 +364,56 @@ endfunc
mov w9, #0
b 11f
2:
- cmp w9, w4
- bge 20f
-
// Stores start from the second iteration
st2 {v4.8h, v5.8h}, [x17], #32
11:
.if \pix_size == 1
- // Load 8 pixels: s0 & s1+2
- ld1 {v0.8b}, [x10] // s0-1
- add x10, x10, #1
- ld1 {v1.8b}, [x10] // s0
- add x10, x10, #7
+ // Load 8 pixels each: s0-1, s0 and s3, s3+1
+ mov x16, #1
+ mov x14, #7
+ ld1 {v0.8b}, [x10], x16 // s0-1
+ ld1 {v1.8b}, [x10], x14 // s0
uxtl v16.8h, v0.8b
uxtl v20.8h, v1.8b
- ld1 {v0.8b}, [x11] // s1-2
- add x11, x11, #2
- ld1 {v1.8b}, [x11] // s1
- add x11, x11, #2
- ld1 {v2.8b}, [x11] // s1+2
- add x11, x11, #4
+ ld1 {v0.8b}, [x13], x16 // s3
+ ld1 {v1.8b}, [x13], x14 // s3+1
+ uxtl v28.8h, v0.8b
+ uxtl v19.8h, v1.8b
+
+ mov x16, #2
+ mov x14, #4
+ ld1 {v0.8b}, [x11], x16 // s1-2
+ ld1 {v1.8b}, [x11], x16 // s1
+ ld1 {v2.8b}, [x11], x14 // s1+2
uxtl v17.8h, v0.8b
uxtl v22.8h, v1.8b
uxtl v26.8h, v2.8b
- ld1 {v0.8b}, [x12] // s2-2
- add x12, x12, #2
- ld1 {v1.8b}, [x12] // s2
- add x12, x12, #2
- ld1 {v2.8b}, [x12] // s2+2
- add x12, x12, #4
+ ld1 {v0.8b}, [x12], x16 // s2-2
+ ld1 {v1.8b}, [x12], x16 // s2
+ ld1 {v2.8b}, [x12], x14 // s2+2
uxtl v18.8h, v0.8b
uxtl v24.8h, v1.8b
uxtl v27.8h, v2.8b
-
- ld1 {v0.8b}, [x13] // s3
- add x13, x13, #1
- ld1 {v1.8b}, [x13] // s3+1
- add x13, x13, #7
- uxtl v28.8h, v0.8b
- uxtl v19.8h, v1.8b
.else
- ld1 {v16.8h}, [x10] // s0-1
- add x10, x10, #2
- ld1 {v20.8h}, [x10] // s0
- add x10, x10, #14
+ mov x16, #2
+ mov x14, #14
+ ld1 {v16.8h}, [x10], x16 // s0-1
+ ld1 {v20.8h}, [x10], x14 // s0
- ld1 {v17.8h}, [x11] // s1-2
- add x11, x11, #4
- ld1 {v22.8h}, [x11] // s1
- add x11, x11, #4
- ld1 {v26.8h}, [x11] // s1+2
- add x11, x11, #8
+ ld1 {v28.8h}, [x13], x16 // s3
+ ld1 {v19.8h}, [x13], x14 // s3+1
- ld1 {v18.8h}, [x12] // s2-2
- add x12, x12, #4
- ld1 {v24.8h}, [x12] // s2
- add x12, x12, #4
- ld1 {v27.8h}, [x12] // s2+2
- add x12, x12, #8
+ mov x16, #4
+ mov x14, #8
+ ld1 {v17.8h}, [x11], x16 // s1-2
+ ld1 {v22.8h}, [x11], x16 // s1
+ ld1 {v26.8h}, [x11], x14 // s1+2
- ld1 {v28.8h}, [x13] // s3
- add x13, x13, #2
- ld1 {v19.8h}, [x13] // s3+1
- add x13, x13, #14
+ ld1 {v18.8h}, [x12], x16 // s2-2
+ ld1 {v24.8h}, [x12], x16 // s2
+ ld1 {v27.8h}, [x12], x14 // s2+2
.endif
// Grad: Vertical & D0 (interleaved)
@@ -497,70 +437,59 @@ endfunc
sub v0.8h, v30.8h, v21.8h
sub v1.8h, v31.8h, v25.8h
sabd v5.8h, v0.8h, v23.8h
- mov v16.16b, v20.16b // filler to hide the sabd->saba latency
- saba v5.8h, v1.8h, v29.8h
add w9, w9, #8 // x += 8
- b 2b
-20:
+ saba v5.8h, v1.8h, v29.8h
+ cmp w9, w4
+ b.lt 2b
+
+ add w8, w8, #ALF_GRADIENT_STEP // y += 2
// One 8-pixel pass equals 4 iterations of the generic code; the
// final pass has only 4 valid pixels, so store half of each register
st2 {v4.4h, v5.4h}, [x17], #16
-
- add w8, w8, #ALF_GRADIENT_STEP // y += 2
- b 1b
-10:
+ cmp w8, w5
+ b.lt 1b
ret
.endm
-.macro ff_alf_classify_sum
+.macro ff_alf_classify_sum v16_4s, v21_4s, addr
+ ld1 {v17.8h, v18.8h}, [\addr], #32
+ ld1 {v19.8h}, [\addr], x3
+ uxtl v0.4s, v17.4h
+ uxtl2 v1.4s, v17.8h
+ add \v16_4s, \v16_4s, v0.4s
+ uxtl v2.4s, v18.4h
+ add \v16_4s, \v16_4s, v1.4s
+ uxtl2 v3.4s, v18.8h
+ uxtl v4.4s, v19.4h
+ add v2.4s, v2.4s, v3.4s
+ uxtl2 v5.4s, v19.8h
+ add \v21_4s, \v21_4s, v2.4s
+ add v4.4s, v4.4s, v5.4s
+ add \v16_4s, \v16_4s, v2.4s
+ add \v21_4s, \v21_4s, v4.4s
+.endm
+
+function ff_alf_classify_sum_neon, export=1
// sum0 .req x0
// sum1 .req x1
// grad .req x2
// gshift .req w3
// steps .req w4
- mov w5, #2
- mov w6, #0
- mul w3, w3, w5
+ lsl w3, w3, #1
movi v16.4s, #0
movi v21.4s, #0
-6:
- prfm pldl1keep, [x2]
- cmp w6, w4
- bge 60f
- ld1 {v17.4h}, [x2], #8
- ld1 {v18.4h}, [x2], #8
- uxtl v17.4s, v17.4h
- ld1 {v19.4h}, [x2], #8
- uxtl v18.4s, v18.4h
- ld1 {v20.4h}, [x2], #8
- uxtl v19.4s, v19.4h
- ld1 {v22.4h}, [x2], #8
- uxtl v20.4s, v20.4h
- ld1 {v23.4h}, [x2]
- uxtl v22.4s, v22.4h
- uxtl v23.4s, v23.4h
- add v17.4s, v17.4s, v18.4s
- add v16.4s, v16.4s, v17.4s
- add v19.4s, v19.4s, v20.4s
- add v22.4s, v22.4s, v23.4s
- add v21.4s, v21.4s, v19.4s
- add v16.4s, v16.4s, v19.4s
- add v21.4s, v21.4s, v22.4s
+ ff_alf_classify_sum v16.4s, v21.4s, x2
+ ff_alf_classify_sum v16.4s, v21.4s, x2
+ ff_alf_classify_sum v16.4s, v21.4s, x2
- sub x2, x2, #8
- add w6, w6, #1 // i += 1
- add x2, x2, x3 // grad += gstride - size * ALF_NUM_DIR
- b 6b
+ cmp w4, #4
+ blt 60f
+ ff_alf_classify_sum v16.4s, v21.4s, x2
60:
st1 {v16.4s}, [x0]
st1 {v21.4s}, [x1]
ret
-.endm
-
-
-function ff_alf_classify_sum_neon, export=1
- ff_alf_classify_sum
endfunc
function ff_alf_classify_grad_8_neon, export=1
diff --git a/libavcodec/aarch64/vvc/alf_template.c b/libavcodec/aarch64/vvc/alf_template.c
index 470222a634..97c6c34d05 100644
--- a/libavcodec/aarch64/vvc/alf_template.c
+++ b/libavcodec/aarch64/vvc/alf_template.c
@@ -161,7 +161,7 @@ static void FUNC2(alf_filter_chroma, BIT_DEPTH, _neon)(uint8_t *_dst,
#define ALF_DIR_DIGA0 2
#define ALF_DIR_DIGA1 3
-static void FUNC(ff_alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac)
+static void FUNC(alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac)
{
static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 };
@@ -218,11 +218,11 @@ static void FUNC(ff_alf_classify)(int *class_idx, int *transpose_idx,
int sum0[ALF_NUM_DIR];
int sum1[ALF_NUM_DIR];
grad = gradient_tmp + (yg + start) * gstride + xg * ALF_NUM_DIR;
- ff_alf_classify_sum_neon(sum0, sum1, grad, gshift, end-start);
- FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum0, ac);
+ ff_alf_classify_sum_neon(sum0, sum1, grad, gshift, end-start);
+ FUNC(alf_get_idx)(class_idx, transpose_idx, sum0, ac);
class_idx++;
transpose_idx++;
- FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum1, ac);
+ FUNC(alf_get_idx)(class_idx, transpose_idx, sum1, ac);
class_idx++;
transpose_idx++;
}
@@ -237,8 +237,9 @@ void FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(int *class_idx, int *transpos
static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
- const int vb_pos, int *gradient_tmp) {
- FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
- FUNC(ff_alf_classify)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
+ const int vb_pos, int *gradient_tmp)
+{
+ FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
+ FUNC(ff_alf_classify)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
}
--
2.49.1
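For context, a hypothetical scalar equivalent of the rewritten ff_alf_classify_sum_neon (illustrative only; the helper name and loop shape are mine, memset is from <string.h>, ALF_NUM_DIR and gshift as defined in the patches above). Each gradient row holds ALF_NUM_DIR int16 values per gradient point; one call covers six points, with sum0 accumulating points 0..3 and sum1 points 2..5, so the two 4x4 blocks share the two middle points. The NEON version unrolls the row loop for the only two values the caller passes, steps = 3 or 4:

    /* Illustrative only: scalar model of ff_alf_classify_sum_neon() */
    static void alf_classify_sum_c(int *sum0, int *sum1, const int16_t *grad,
                                   const int gshift, const int steps)
    {
        memset(sum0, 0, ALF_NUM_DIR * sizeof(*sum0));
        memset(sum1, 0, ALF_NUM_DIR * sizeof(*sum1));
        for (int i = 0; i < steps; i++) {
            for (int p = 0; p < 6; p++) {                 /* 6 gradient points per row */
                for (int dir = 0; dir < ALF_NUM_DIR; dir++) {
                    const int v = grad[p * ALF_NUM_DIR + dir];
                    if (p < 4)
                        sum0[dir] += v;                   /* points 0..3 */
                    if (p >= 2)
                        sum1[dir] += v;                   /* points 2..5 */
                }
            }
            grad += 4 * ALF_NUM_DIR + gshift;             /* advance one gradient row */
        }
    }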