* [FFmpeg-devel] [PATCH] vvc-bdof-rework-2 (PR #20241)
@ 2025-08-14 16:35 Zhao Zhili
0 siblings, 0 replies; only message in thread
From: Zhao Zhili @ 2025-08-14 16:35 UTC (permalink / raw)
To: ffmpeg-devel
PR #20241 opened by Zhao Zhili (quink)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20241
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20241.patch
From c3362c98ce019e218463a62fbee770b16c3bb478 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <zhilizhao@tencent.com>
Date: Fri, 20 Jun 2025 21:15:20 +0800
Subject: [PATCH 1/2] avcodec/aarch64/vvc: Optimize derive_bdof_vx_vy
Before After
-----------------------------------------------------------------
apply_bdof_8_8x16_c: | 7375.5 ( 1.00x) | 7473.8 ( 1.00x)
apply_bdof_8_8x16_neon: | 1875.1 ( 3.93x) | 1135.8 ( 6.58x)
apply_bdof_8_16x8_c: | 7273.9 ( 1.00x) | 7204.0 ( 1.00x)
apply_bdof_8_16x8_neon: | 1738.2 ( 4.18x) | 1013.0 ( 7.11x)
apply_bdof_8_16x16_c: | 14744.9 ( 1.00x) | 14712.6 ( 1.00x)
apply_bdof_8_16x16_neon: | 3446.7 ( 4.28x) | 1997.7 ( 7.36x)
apply_bdof_10_8x16_c: | 7352.4 ( 1.00x) | 7485.7 ( 1.00x)
apply_bdof_10_8x16_neon: | 1861.0 ( 3.95x) | 1134.1 ( 6.60x)
apply_bdof_10_16x8_c: | 7330.5 ( 1.00x) | 7232.8 ( 1.00x)
apply_bdof_10_16x8_neon: | 1747.2 ( 4.20x) | 1002.6 ( 7.21x)
apply_bdof_10_16x16_c: | 14522.4 ( 1.00x) | 14664.8 ( 1.00x)
apply_bdof_10_16x16_neon: | 3490.5 ( 4.16x) | 1978.4 ( 7.41x)
apply_bdof_12_8x16_c: | 7389.0 ( 1.00x) | 7380.1 ( 1.00x)
apply_bdof_12_8x16_neon: | 1861.3 ( 3.97x) | 1134.0 ( 6.51x)
apply_bdof_12_16x8_c: | 7283.1 ( 1.00x) | 7336.9 ( 1.00x)
apply_bdof_12_16x8_neon: | 1749.1 ( 4.16x) | 1002.3 ( 7.32x)
apply_bdof_12_16x16_c: | 14580.7 ( 1.00x) | 14502.7 ( 1.00x)
apply_bdof_12_16x16_neon: | 3472.9 ( 4.20x) | 1978.3 ( 7.33x)
Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
---
libavcodec/aarch64/vvc/dsp_init.c | 17 +-
libavcodec/aarch64/vvc/inter.S | 606 ++++++++++++++++-----------
libavcodec/aarch64/vvc/of_template.c | 15 +-
3 files changed, 386 insertions(+), 252 deletions(-)
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 9a171234f6..1db38ebb1d 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -37,11 +37,18 @@ void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
ptrdiff_t src_stride,
int width, int height);
-void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0, const int16_t *_src1,
- int pad_mask,
- const int16_t **gradient_h,
- const int16_t **gradient_v,
- int16_t *vx, int16_t *vy);
+void ff_vvc_derive_bdof_vx_vy_8x_neon(const int16_t *_src0,
+ const int16_t *_src1,
+ int16_t *const gradient_h[2],
+ int16_t *const gradient_v[2],
+ int16_t vx[16], int16_t vy[16],
+ int block_h);
+void ff_vvc_derive_bdof_vx_vy_16x_neon(const int16_t *_src0,
+ const int16_t *_src1,
+ int16_t *const gradient_h[2],
+ int16_t *const gradient_v[2],
+ int16_t vx[16], int16_t vy[16],
+ int block_h);
#define BIT_DEPTH 8
#include "alf_template.c"
#include "of_template.c"
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index c299e6f68b..06c6f3619b 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -804,262 +804,388 @@ function ff_vvc_apply_bdof_block_12_neon, export=1
vvc_apply_bdof_block 12
endfunc
-function ff_vvc_derive_bdof_vx_vy_neon, export=1
- src0 .req x0
- src1 .req x1
- pad_mask .req w2
- gh .req x3
- gv .req x4
- vx .req x5
- vy .req x6
+const bdof_vx_vy_8x_tbl
+ .byte 0, 1, 16, 16, 16, 16, 8, 9
+ .byte 6, 7, 16, 16, 16, 16, 14, 15
+endconst
- gh0 .req x7
- gh1 .req x8
- gv0 .req x9
- gv1 .req x10
- y .req x12
+const bdof_vx_vy_16x_tbl
+ .byte 0, 1, 64, 64, 64, 64, 8, 9
+ .byte 6, 7, 64, 64, 64, 64, 16, 17
+ .byte 14, 15, 64, 64, 64, 64, 24, 25
+ .byte 22, 23, 64, 64, 64, 64, 30, 31
+endconst
- sgx2 .req w7
- sgy2 .req w8
- sgxgy .req w9
- sgxdi .req w10
- sgydi .req w11
+// line(-1), line0, line1, line2, line3, line4
+// line3 and line4 become line(-1) and line0 in the next block.
+.macro bdof_vx_vy_8x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
+ mov \tmp0\().16b, v28.16b
+ mov \tmp1\().16b, v29.16b
+ mov \tmp2\().16b, v30.16b
+ mov \tmp3\().16b, v31.16b
+ mov \tmp4\().16b, v8.16b
+.endm
- sgx2_v .req v22
- sgy2_v .req v23
- sgxgy_v .req v24
- sgxdi_v .req v25
- sgydi_v .req v26
+.macro bdof_vx_vy_8x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
+ add v25.4s, v25.4s, \tmp0\().4s
+ add v27.4s, v27.4s, \tmp1\().4s
+ add v23.4s, v23.4s, \tmp2\().4s
+ sub v26.4s, v26.4s, \tmp3\().4s
+ sub v24.4s, v24.4s, \tmp4\().4s
+.endm
- sgx2_v2 .req v27
- sgy2_v2 .req v28
- sgxgy_v2 .req v29
- sgxdi_v2 .req v30
- sgydi_v2 .req v31
+.macro bdof_vx_vy_8x_padding_left_right src, tmp0, tmp1, dst
+ tbl \tmp0\().16b, { \src\().16b }, v0.16b
+ saddl \tmp1\().4s, \tmp0\().4h, \src\().4h
+ saddl2 \dst\().4s, \tmp0\().8h, \src\().8h
+ addp \dst\().4s, \tmp1\().4s, \dst\().4s
+.endm
+
+.macro bdof_vx_vy_sign src, tmp0, tmp1, dst
+ cmlt \tmp0\().8h, \src\().8h, #0
+ cmgt \tmp1\().8h, \src\().8h, #0
+ sub \dst\().8h, \tmp0\().8h, \tmp1\().8h
+.endm
+
+.macro bdof_vx_vy_clip_mask src, max, min, mask, dst
+ smin \src\().4s, \src\().4s, \max\().4s
+ smax \src\().4s, \src\().4s, \min\().4s
+ cmgt \mask\().4s, \mask\().4s, #0
+ and \dst\().16b, \src\().16b, \mask\().16b
+.endm
+
+.macro bdof_vx_vy_16x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
+ mov \tmp0\().16b, v29.16b
+ mov \tmp1\().16b, v30.16b
+ mov \tmp2\().16b, v31.16b
+ mov \tmp3\().16b, v8.16b
+ mov \tmp4\().16b, v9.16b
+.endm
+
+.macro bdof_vx_vy_16x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
+ add v25.4s, v25.4s, \tmp0\().4s
+ add v24.4s, v24.4s, \tmp1\().4s
+ add v26.4s, v26.4s, \tmp2\().4s
+ sub v28.4s, v28.4s, \tmp3\().4s
+ sub v27.4s, v27.4s, \tmp4\().4s
+.endm
+
+.macro bdof_vx_vy_16x_padding_left_right src0, src1, tmp0, tmp1, tmp2, dst
+ tbl \tmp0\().16b, {\src0\().16b, \src1\().16b}, v0.16b
+ tbl v2.16b, {\src0\().16b, \src1\().16b}, v1.16b
+ saddl \tmp1\().4s, \tmp0\().4h, \src0\().4h
+ saddl \tmp2\().4s, v2.4h, \src1\().4h
+ saddl2 \tmp0\().4s, \tmp0\().8h, \src0\().8h
+ saddl2 \dst\().4s, v2.8h, \src1\().8h
+ addp \tmp0\().4s, \tmp1\().4s, \tmp0\().4s
+ addp \dst\().4s, \tmp2\().4s, \dst\().4s
+ addp \dst\().4s, \tmp0\().4s, \dst\().4s
+.endm
+
+/*
+ * x0: const int16_t *_src0,
+ * x1: const int16_t *_src1,
+ * x2: int16_t *gradient_h[2],
+ * x3: int16_t *gradient_v[2],
+ * x4: int16_t vx[16],
+ * x5: int16_t vy[16],
+ * w6: int block_h
+ */
+function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
+ stp d11, d10, [sp, #-0x20]!
+ stp d9, d8, [sp, #0x10]
+
+ ldp x14, x13, [x2] // gh0, gh1
+ ldp x10, x9, [x3] // gv0, gv1
+ movrel x11, bdof_vx_vy_8x_tbl
+ ldr q0, [x11] // table
+ mvni v2.4s, #30 // -31, for log2
+ movi v3.4s, #15 // clip to 15
+ mvni v4.4s, #14 // clip to -15
+
+ mov w11, #0x8
+ mov w12, w6 // y = block_h
+ b 4f
- ldp gh0, gh1, [gh]
- ldp gv0, gv1, [gv]
- movi sgx2_v.4s, #0
- movi sgy2_v.4s, #0
- movi sgxgy_v.4s, #0
- movi sgxdi_v.4s, #0
- movi sgydi_v.4s, #0
- movi sgx2_v2.4s, #0
- movi sgy2_v2.4s, #0
- movi sgxgy_v2.4s, #0
- movi sgxdi_v2.4s, #0
- movi sgydi_v2.4s, #0
- mov x13, #-1 // dy
- movi v6.4s, #0
- mov y, #-1
- tbz pad_mask, #1, 1f // check pad top
- mov x13, #0 // dy: pad top
1:
- mov x16, #-2 // dx
- add x14, src0, x13, lsl #8 // local src0
- add x15, src1, x13, lsl #8 // local src1
- add x17, x16, x13, lsl #5
- ldr q0, [x14, x16]
- ldr q1, [x15, x16]
- ldr q2, [gh0, x17]
- ldr q3, [gh1, x17]
- ldr q4, [gv0, x17]
- ldr q5, [gv1, x17]
- add x16, x16, #8
- add x17, x17, #8
- ins v0.s[3], v6.s[3]
- ins v1.s[3], v6.s[3]
- ins v2.s[3], v6.s[3]
- ins v3.s[3], v6.s[3]
- ins v4.s[3], v6.s[3]
- ins v5.s[3], v6.s[3]
+ // save line4 results
+ bdof_vx_vy_8x_save_line v5, v6, v7, v16, v17
+2:
+ addp v25.4s, v25.4s, v25.4s
+ addp v27.4s, v27.4s, v27.4s
+ addp v26.4s, v26.4s, v26.4s
+ addp v23.4s, v23.4s, v23.4s
+ addp v24.4s, v24.4s, v24.4s
- ldr q16, [x14, x16]
- ldr q17, [x15, x16]
- ldr q18, [gh0, x17]
- ldr q19, [gh1, x17]
- ldr q20, [gv0, x17]
- ldr q21, [gv1, x17]
- ins v16.s[3], v6.s[3]
- ins v17.s[3], v6.s[3]
- ins v18.s[3], v6.s[3]
- ins v19.s[3], v6.s[3]
- ins v20.s[3], v6.s[3]
- ins v21.s[3], v6.s[3]
+ clz v28.4s, v25.4s
+ add v28.4s, v28.4s, v2.4s // log2
+ shl v26.4s, v26.4s, #0x2
+ sshl v26.4s, v26.4s, v28.4s
- tbz pad_mask, #0, 20f
- // pad left
- ins v0.h[0], v0.h[1]
- ins v1.h[0], v1.h[1]
- ins v2.h[0], v2.h[1]
- ins v3.h[0], v3.h[1]
- ins v4.h[0], v4.h[1]
- ins v5.h[0], v5.h[1]
-20:
- tbz pad_mask, #2, 21f
- // pad right
- ins v16.h[5], v16.h[4]
- ins v17.h[5], v17.h[4]
- ins v18.h[5], v18.h[4]
- ins v19.h[5], v19.h[4]
- ins v20.h[5], v20.h[4]
- ins v21.h[5], v21.h[4]
-21:
- sshr v0.8h, v0.8h, #4
- sshr v1.8h, v1.8h, #4
- add v2.8h, v2.8h, v3.8h
- add v4.8h, v4.8h, v5.8h
- sub v0.8h, v0.8h, v1.8h // diff
- sshr v2.8h, v2.8h, #1 // temph
- sshr v4.8h, v4.8h, #1 // tempv
+ bdof_vx_vy_clip_mask v26, v3, v4, v25, v25
+ sqxtn v26.4h, v25.4s
+ st1 {v26.s}[0], [x4], x11
- sshr v16.8h, v16.8h, #4
- sshr v17.8h, v17.8h, #4
- add v18.8h, v18.8h, v19.8h
- add v20.8h, v20.8h, v21.8h
- sub v16.8h, v16.8h, v17.8h // diff
- sshr v18.8h, v18.8h, #1 // temph
- sshr v20.8h, v20.8h, #1 // tempv
+ subs x12, x12, #(BDOF_MIN_BLOCK_SIZE)
- abs v3.8h, v2.8h
- abs v5.8h, v4.8h
- uxtl v19.4s, v3.4h
- uxtl v21.4s, v5.4h
- uxtl2 v3.4s, v3.8h
- uxtl2 v5.4s, v5.8h
- add v3.4s, v3.4s, v19.4s
- add v5.4s, v5.4s, v21.4s
- add sgx2_v.4s, sgx2_v.4s, v3.4s
- add sgy2_v.4s, sgy2_v.4s, v5.4s
+ clz v26.4s, v27.4s
+ add v26.4s, v26.4s, v2.4s
+ shl v24.4s, v24.4s, #0x2
+ mul v23.4s, v25.4s, v23.4s
+ sshr v23.4s, v23.4s, #0x1
+ sub v23.4s, v24.4s, v23.4s
+ sshl v23.4s, v23.4s, v26.4s
- abs v3.8h, v18.8h
- abs v5.8h, v20.8h
- uxtl v19.4s, v3.4h
- uxtl v21.4s, v5.4h
- uxtl2 v3.4s, v3.8h
- uxtl2 v5.4s, v5.8h
- add v3.4s, v3.4s, v19.4s
- add v5.4s, v5.4s, v21.4s
- add sgx2_v2.4s, sgx2_v2.4s, v3.4s
- add sgy2_v2.4s, sgy2_v2.4s, v5.4s
+ bdof_vx_vy_clip_mask v23, v3, v4, v27, v23
+ sqxtn v23.4h, v23.4s
+ st1 {v23.s}[0], [x5], x11
- cmgt v17.8h, v4.8h, #0
- cmlt v7.8h, v4.8h, #0
- cmgt v19.8h, v20.8h, #0
- cmlt v21.8h, v20.8h, #0
- sub v17.8h, v7.8h, v17.8h // VVC_SIGN(tempv)
- sub v19.8h, v21.8h, v19.8h // VVC_SIGN(tempv)
-
- smlal sgxgy_v.4s, v17.4h, v2.4h
- smlal2 sgxgy_v.4s, v17.8h, v2.8h
- smlsl sgydi_v.4s, v17.4h, v0.4h
- smlsl2 sgydi_v.4s, v17.8h, v0.8h
-
- cmgt v3.8h, v2.8h, #0
- cmlt v5.8h, v2.8h, #0
- cmgt v17.8h, v18.8h, #0
- cmlt v21.8h, v18.8h, #0
- sub v3.8h, v5.8h, v3.8h // VVC_SIGN(temph)
- sub v17.8h, v21.8h, v17.8h // VVC_SIGN(temph)
-
- smlal sgxgy_v2.4s, v19.4h, v18.4h
- smlal2 sgxgy_v2.4s, v19.8h, v18.8h
- smlsl sgydi_v2.4s, v19.4h, v16.4h
- smlsl2 sgydi_v2.4s, v19.8h, v16.8h
-
- smlsl sgxdi_v.4s, v3.4h, v0.4h
- smlsl2 sgxdi_v.4s, v3.8h, v0.8h
- smlsl sgxdi_v2.4s, v17.4h, v16.4h
- smlsl2 sgxdi_v2.4s, v17.8h, v16.8h
-3:
- add y, y, #1
- cmp y, #(BDOF_MIN_BLOCK_SIZE)
- mov x13, y
- b.gt 4f
- b.lt 1b
- tbz pad_mask, #3, 1b
- sub x13, x13, #1 // pad bottom
- b 1b
+ b.eq 16f
4:
- addv s22, sgx2_v.4s
- addv s23, sgy2_v.4s
- addv s24, sgxgy_v.4s
- addv s25, sgxdi_v.4s
- addv s26, sgydi_v.4s
+ mov x15, #0x0 // dy, inner loop
- mov w3, #31
- mov w16, #-15
- mov w17, #15
-40:
- mov w14, #0
+ movi v25.2d, #0
+ movi v27.2d, #0
+ movi v23.2d, #0
+ movi v26.2d, #0
+ movi v24.2d, #0
+ b 8f
- mov sgx2, v22.s[0]
- mov sgy2, v23.s[0]
- mov sgxgy, v24.s[0]
- mov sgxdi, v25.s[0]
- mov sgydi, v26.s[0]
-
- cbz sgx2, 5f
- clz w12, sgx2
- lsl sgxdi, sgxdi, #2
- sub w13, w3, w12 // log2(sgx2)
- asr sgxdi, sgxdi, w13
- cmp sgxdi, w16
- csel w14, w16, sgxdi, lt // clip to -15
- b.le 5f
- cmp sgxdi, w17
- csel w14, w17, sgxdi, gt // clip to 15
5:
- strh w14, [vx], #2
+ // add line(-1) and line0 from previous results
+ bdof_vx_vy_8x_add_line v18, v19, v20, v21, v22
+ bdof_vx_vy_8x_add_line v5, v6, v7, v16, v17
+ add x15, x15, #1
+8:
+ cmp w12, w6
+ b.hs 9f
+ // y < block_h && dy == 0, reuse previous results
+ cbz x15, 5b
+9:
+ ldr q28, [x0] // src0
+ ldr q29, [x1] // src1
+ ldr q30, [x14], #(BDOF_BLOCK_SIZE * 2) // gh0
+ ldr q31, [x13], #(BDOF_BLOCK_SIZE * 2) // gh1
+ ldr q8, [x10], #(BDOF_BLOCK_SIZE * 2) // gv0
+ ldr q9, [x9], #(BDOF_BLOCK_SIZE * 2) // gv1
+ add x0, x0, #(VVC_MAX_PB_SIZE * 2)
+ add x1, x1, #(VVC_MAX_PB_SIZE * 2)
- mov w15, #0
- cbz sgy2, 6f
- lsl sgydi, sgydi, #2
- smull x14, w14, sgxgy
- asr w14, w14, #1
- sub sgydi, sgydi, w14
- clz w12, sgy2
- sub w13, w3, w12 // log2(sgy2)
- asr sgydi, sgydi, w13
- cmp sgydi, w16
- csel w15, w16, sgydi, lt // clip to -15
- b.le 6f
- cmp sgydi, w17
- csel w15, w17, sgydi, gt // clip to 15
-6:
- strh w15, [vy], #2
- cbz x0, 7f
- addv s22, sgx2_v2.4s
- addv s23, sgy2_v2.4s
- addv s24, sgxgy_v2.4s
- addv s25, sgxdi_v2.4s
- addv s26, sgydi_v2.4s
- mov x0, #0
- b 40b
-7:
+ sshr v28.8h, v28.8h, #0x4
+ sshr v29.8h, v29.8h, #0x4
+ shadd v30.8h, v30.8h, v31.8h // tmph
+ shadd v31.8h, v8.8h, v9.8h // tmpv
+ sub v8.8h, v28.8h, v29.8h // diff
+
+ abs v28.8h, v30.8h
+ abs v29.8h, v31.8h
+
+ bdof_vx_vy_8x_padding_left_right v28, v9, v10, v28
+ bdof_vx_vy_8x_padding_left_right v29, v9, v10, v29
+
+ bdof_vx_vy_sign v30, v9, v10, v9
+ bdof_vx_vy_sign v31, v10, v31, v31
+
+ mul v30.8h, v31.8h, v30.8h
+ mul v9.8h, v9.8h, v8.8h
+ mul v8.8h, v31.8h, v8.8h
+
+ bdof_vx_vy_8x_padding_left_right v30, v31, v10, v30
+ bdof_vx_vy_8x_padding_left_right v9, v31, v10, v31
+ bdof_vx_vy_8x_padding_left_right v8, v9, v10, v8
+
+ bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+
+ cmp w12, w6
+ b.ne 10f
+ cbnz x15, 10f
+
+ // y == block_h && dy == 0, duplicate first line results
+ bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+ add x15, x15, #0x1
+ b 9b
+10:
+ cmp x15, #(BDOF_MIN_BLOCK_SIZE - 1)
+ b.eq 11f
+ cmp x15, #(BDOF_MIN_BLOCK_SIZE)
+ b.ne 12f
+ b 1b
+11:
+ // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
+ // duplicate the results and break
+ cmp x12, #(BDOF_MIN_BLOCK_SIZE)
+ b.eq 13f
+ bdof_vx_vy_8x_save_line v18, v19, v20, v21, v22
+12:
+ add x15, x15, #1
+ b 8b
+13:
+ // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
+ // padding bottom then break
+ bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+ b 2b
+16:
+ ldp d9, d8, [sp, #0x10]
+ ldp d11, d10, [sp], #0x20
+ ret
+endfunc
+
+/*
+ * x0: const int16_t *_src0,
+ * x1: const int16_t *_src1,
+ * x2: int16_t *gradient_h[2],
+ * x3: int16_t *gradient_v[2],
+ * x4: int16_t vx[16],
+ * x5: int16_t vy[16],
+ * w6: int block_h
+ */
+function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
+ sub sp, sp, #0x80
+ stp d15, d14, [sp, #0x30]
+ stp d13, d12, [sp, #0x40]
+ stp d11, d10, [sp, #0x50]
+ stp d9, d8, [sp, #0x60]
+ stp x29, x30, [sp, #0x70]
+
+ ldp x8, x9, [x2] // gh0, gh1
+ ldp x10, x11, [x3] // gv0, gv1
+ movrel x12, bdof_vx_vy_16x_tbl
+ ldp q0, q1, [x12] // table
+ mov w13, w6 // y = block_h
+ b 4f
+
+1:
+ // save line4
+ bdof_vx_vy_16x_save_line v6, v7, v16, v17, v18
+2:
+ clz v3.4s, v25.4s
+ mvni v5.4s, #0x1e
+ add v3.4s, v3.4s, v5.4s // -log2()
+ shl v4.4s, v28.4s, #0x2
+ sshl v3.4s, v4.4s, v3.4s
+
+ movi v28.4s, #0xf // clip to 15
+ mvni v29.4s, #0xe // clip to -15
+ bdof_vx_vy_clip_mask v3, v28, v29, v25, v3
+ sqxtn v4.4h, v3.4s
+ st1 {v4.d}[0], [x4], #(BDOF_MIN_BLOCK_SIZE * 2)
+
+ subs x13, x13, #(BDOF_MIN_BLOCK_SIZE) // y -= BDOF_MIN_BLOCK_SIZE
+
+ clz v4.4s, v24.4s
+ add v4.4s, v4.4s, v5.4s // -log2()
+ shl v5.4s, v27.4s, #0x2
+ mul v3.4s, v3.4s, v26.4s
+ sshr v3.4s, v3.4s, #0x1
+ sub v3.4s, v5.4s, v3.4s
+ sshl v3.4s, v3.4s, v4.4s
+
+ bdof_vx_vy_clip_mask v3, v28, v29, v24, v3
+ sqxtn v3.4h, v3.4s
+ st1 {v3.d}[0], [x5], #(BDOF_MIN_BLOCK_SIZE * 2)
+ b.eq 16f
+4:
+ mov w14, #0x0 // dy, inner loop
+
+ movi v25.2d, #0
+ movi v24.2d, #0
+ movi v26.2d, #0
+ movi v28.2d, #0
+ movi v27.2d, #0
+ b 8f
+
+5:
+ // add line(-1) and line0 from previous results
+ bdof_vx_vy_16x_add_line v19, v20, v21, v22, v23
+ bdof_vx_vy_16x_add_line v6, v7, v16, v17, v18
+ add w14, w14, #0x1
+
+ 8:
+ cmp w13, w6
+ b.hs 9f
+ // y < block_h && dy == 0, reuse previous results
+ cbz w14, 5b
+9:
+ ld1 {v29.8h, v30.8h}, [x0] // src0
+ sshr v31.8h, v29.8h, #0x4
+ ld1 {v8.8h, v9.8h}, [x1] // src1
+ sshr v10.8h, v8.8h, #0x4
+ ld1 {v11.8h, v12.8h}, [x8], #32 // gh0
+ sshr v29.8h, v30.8h, #0x4
+ sshr v30.8h, v9.8h, #0x4
+ ld1 {v8.8h, v9.8h}, [x9], #32 // gh1
+ shadd v13.8h, v11.8h, v8.8h // (gh0 + gh1) >> 1, left half
+ ld1 {v14.8h, v15.8h}, [x10], #32 // gv0
+ ld1 {v3.8h, v4.8h}, [x11], #32 // gv1
+ shadd v5.8h, v14.8h, v3.8h // (gv0 + gv1) >> 1, left half
+ sub v31.8h, v31.8h, v10.8h // diff, left half
+ shadd v8.8h, v12.8h, v9.8h // (gh0 + gh1) >> 1, right half
+ shadd v3.8h, v15.8h, v4.8h // (gv0 + gv1) >> 1, right half
+ sub v4.8h, v29.8h, v30.8h // diff, right half
+
+ abs v29.8h, v13.8h
+ abs v30.8h, v8.8h
+ abs v9.8h, v5.8h
+ abs v10.8h, v3.8h
+
+ add x0, x0, #(VVC_MAX_PB_SIZE * 2)
+ add x1, x1, #(VVC_MAX_PB_SIZE * 2)
+
+ bdof_vx_vy_16x_padding_left_right v29, v30, v11, v12, v14, v29
+ bdof_vx_vy_16x_padding_left_right v9, v10, v11, v12, v14, v30
+
+ bdof_vx_vy_sign v13, v9, v10, v9
+ bdof_vx_vy_sign v8, v10, v11, v10
+ bdof_vx_vy_sign v5, v11, v5, v5
+ bdof_vx_vy_sign v3, v11, v3, v3
+
+ mul v11.8h, v5.8h, v13.8h
+ mul v12.8h, v3.8h, v8.8h
+ mul v8.8h, v9.8h, v31.8h
+ mul v9.8h, v10.8h, v4.8h
+ mul v13.8h, v5.8h, v31.8h
+ mul v14.8h, v3.8h, v4.8h
+
+ bdof_vx_vy_16x_padding_left_right v11, v12, v3, v4, v5, v31
+ bdof_vx_vy_16x_padding_left_right v8, v9, v3, v4, v5, v8
+ bdof_vx_vy_16x_padding_left_right v13, v14, v3, v4, v5, v9
+
+ bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+ // check whether padding top
+ cmp w13, w6
+ b.ne 10f
+ cbnz w14, 10f
+ // y == block_h && dy == 0, padding top
+ bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+ add w14, w14, #0x1
+ b 9b
+10:
+ cmp w14, #(BDOF_MIN_BLOCK_SIZE - 1)
+ b.eq 11f
+ cmp w14, #(BDOF_MIN_BLOCK_SIZE)
+ b.ne 12f
+ // save line4
+ b 1b
+ 11:
+ // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1, padding bottom
+ cmp x13, #(BDOF_MIN_BLOCK_SIZE)
+ b.eq 13f
+ // save line3
+ bdof_vx_vy_16x_save_line v19, v20, v21, v22, v23
+12:
+ add w14, w14, #0x1 // dy++
+ b 8b
+13:
+ // padding bottom
+ bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+ b 2b
+16:
+ // restore
+ ldp x29, x30, [sp, #0x70]
+ ldp d9, d8, [sp, #0x60]
+ ldp d11, d10, [sp, #0x50]
+ ldp d13, d12, [sp, #0x40]
+ ldp d15, d14, [sp, #0x30]
+ add sp, sp, #0x80
ret
-
-.unreq src0
-.unreq src1
-.unreq pad_mask
-.unreq gh
-.unreq gv
-.unreq vx
-.unreq vy
-.unreq sgx2
-.unreq sgy2
-.unreq sgxgy
-.unreq sgxdi
-.unreq sgydi
-.unreq sgx2_v
-.unreq sgy2_v
-.unreq sgxgy_v
-.unreq sgxdi_v
-.unreq sgydi_v
-.unreq sgx2_v2
-.unreq sgy2_v2
-.unreq sgxgy_v2
-.unreq sgxdi_v2
-.unreq sgydi_v2
-.unreq y
endfunc
diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
index ac6182b09d..d8ddaacb14 100644
--- a/libavcodec/aarch64/vvc/of_template.c
+++ b/libavcodec/aarch64/vvc/of_template.c
@@ -41,6 +41,11 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
BDOF_BLOCK_SIZE,
_src1, MAX_PB_SIZE, block_w, block_h);
+ int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
+ if (block_w == 8)
+ ff_vvc_derive_bdof_vx_vy_8x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
+ else
+ ff_vvc_derive_bdof_vx_vy_16x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
@@ -50,14 +55,10 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
int idx = BDOF_BLOCK_SIZE * y + x;
const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
- int16_t vx[2], vy[2];
- int pad_mask = !x | ((!y) << 1) |
- ((x + 2 * BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
- ((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
- ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, vx, vy);
+ int idx1 = y + x / BDOF_MIN_BLOCK_SIZE;
FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
- src0, src1, gh, gv,
- vx, vy);
+ src0, src1, gh, gv,
+ vx + idx1, vy + idx1);
}
dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
}
--
2.49.1
From 6edb238e7ac1f98c7b16b0db05e0f5436152cc48 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <zhilizhao@tencent.com>
Date: Thu, 14 Aug 2025 12:42:38 +0800
Subject: [PATCH 2/2] avcodec/aarch64/vvc: Optimize apply_bdof
Before After
--------------------------------------------------------------------
apply_bdof_8_8x16_c: | 7431.4 ( 1.00x) | 7371.7 ( 1.00x)
apply_bdof_8_8x16_neon: | 1175.4 ( 6.32x) | 1036.3 ( 7.11x)
apply_bdof_8_16x8_c: | 7182.2 ( 1.00x) | 7201.1 ( 1.00x)
apply_bdof_8_16x8_neon: | 1021.7 ( 7.03x) | 879.9 ( 8.18x)
apply_bdof_8_16x16_c: | 14577.1 ( 1.00x) | 14589.3 ( 1.00x)
apply_bdof_8_16x16_neon: | 2012.8 ( 7.24x) | 1743.3 ( 8.37x)
apply_bdof_10_8x16_c: | 7292.4 ( 1.00x) | 7308.5 ( 1.00x)
apply_bdof_10_8x16_neon: | 1156.3 ( 6.31x) | 1045.3 ( 6.99x)
apply_bdof_10_16x8_c: | 7112.4 ( 1.00x) | 7214.4 ( 1.00x)
apply_bdof_10_16x8_neon: | 1007.6 ( 7.06x) | 904.8 ( 7.97x)
apply_bdof_10_16x16_c: | 14363.3 ( 1.00x) | 14476.4 ( 1.00x)
apply_bdof_10_16x16_neon: | 1986.9 ( 7.23x) | 1783.1 ( 8.12x)
apply_bdof_12_8x16_c: | 7433.3 ( 1.00x) | 7374.7 ( 1.00x)
apply_bdof_12_8x16_neon: | 1155.9 ( 6.43x) | 1040.8 ( 7.09x)
apply_bdof_12_16x8_c: | 7171.1 ( 1.00x) | 7376.3 ( 1.00x)
apply_bdof_12_16x8_neon: | 1010.8 ( 7.09x) | 899.4 ( 8.20x)
apply_bdof_12_16x16_c: | 14515.5 ( 1.00x) | 14731.5 ( 1.00x)
apply_bdof_12_16x16_neon: | 1988.4 ( 7.30x) | 1785.2 ( 8.25x)
---
libavcodec/aarch64/vvc/dsp_init.c | 37 +--
libavcodec/aarch64/vvc/inter.S | 402 ++++++++++++++++++++++++---
libavcodec/aarch64/vvc/of_template.c | 65 -----
3 files changed, 368 insertions(+), 136 deletions(-)
delete mode 100644 libavcodec/aarch64/vvc/of_template.c
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 1db38ebb1d..df0b536539 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -30,38 +30,16 @@
#define BDOF_BLOCK_SIZE 16
#define BDOF_MIN_BLOCK_SIZE 4
-void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
- int16_t *gradient_v,
- ptrdiff_t gradient_stride,
- const int16_t *_src,
- ptrdiff_t src_stride,
- int width, int height);
-
-void ff_vvc_derive_bdof_vx_vy_8x_neon(const int16_t *_src0,
- const int16_t *_src1,
- int16_t *const gradient_h[2],
- int16_t *const gradient_v[2],
- int16_t vx[16], int16_t vy[16],
- int block_h);
-void ff_vvc_derive_bdof_vx_vy_16x_neon(const int16_t *_src0,
- const int16_t *_src1,
- int16_t *const gradient_h[2],
- int16_t *const gradient_v[2],
- int16_t vx[16], int16_t vy[16],
- int block_h);
#define BIT_DEPTH 8
#include "alf_template.c"
-#include "of_template.c"
#undef BIT_DEPTH
#define BIT_DEPTH 10
#include "alf_template.c"
-#include "of_template.c"
#undef BIT_DEPTH
#define BIT_DEPTH 12
#include "alf_template.c"
-#include "of_template.c"
#undef BIT_DEPTH
int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
@@ -121,6 +99,15 @@ DMVR_FUN(hv_, 8)
DMVR_FUN(hv_, 10)
DMVR_FUN(hv_, 12)
+#define APPLY_BDOF_FUNC(bd) \
+ void ff_vvc_apply_bdof_ ## bd ## _neon(uint8_t *_dst, ptrdiff_t _dst_stride, \
+ const int16_t *_src0, const int16_t *_src1, \
+ int block_w, int block_h);
+
+APPLY_BDOF_FUNC(8)
+APPLY_BDOF_FUNC(10)
+APPLY_BDOF_FUNC(12)
+
void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
{
int cpu_flags = av_get_cpu_flags();
@@ -202,7 +189,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.w_avg = vvc_w_avg_8;
c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
- c->inter.apply_bdof = apply_bdof_8;
+ c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon;
c->sao.band_filter[0] = ff_h26x_sao_band_filter_8x8_8_neon;
for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
@@ -246,7 +233,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.avg = ff_vvc_avg_10_neon;
c->inter.w_avg = vvc_w_avg_10;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
- c->inter.apply_bdof = apply_bdof_10;
+ c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
@@ -255,7 +242,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.w_avg = vvc_w_avg_12;
c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
- c->inter.apply_bdof = apply_bdof_12;
+ c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 06c6f3619b..61de56c6ac 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -716,7 +716,93 @@ function ff_vvc_prof_grad_filter_8x_neon, export=1
.unreq height
endfunc
-.macro vvc_apply_bdof_block bit_depth
+function ff_vvc_bdof_grad_filter_8x_neon, export=1
+ gh0 .req x0
+ gh1 .req x1
+ gv0 .req x2
+ gv1 .req x3
+ src0 .req x4
+ src1 .req x5
+ width .req w6
+ height .req w7
+
+1:
+ mov x10, src0
+ mov w11, width
+ mov x12, gh0
+ mov x13, gv0
+ mov x14, src1
+ mov x15, gh1
+ mov x16, gv1
+2:
+ ldur q0, [x10, #2]
+ ldur q1, [x10, #-2]
+ ldr q2, [x10, #(VVC_MAX_PB_SIZE << 1)]
+ ldr q3, [x10, #-(VVC_MAX_PB_SIZE << 1)]
+ sshr v0.8h, v0.8h, #6
+ sshr v1.8h, v1.8h, #6
+ ldur q4, [x14, #2]
+ ldur q5, [x14, #-2]
+ sshr v2.8h, v2.8h, #6
+ sshr v3.8h, v3.8h, #6
+ ldr q6, [x14, #(VVC_MAX_PB_SIZE << 1)]
+ ldr q7, [x14, #-(VVC_MAX_PB_SIZE << 1)]
+ // results of gradient_h0
+ sub v0.8h, v0.8h, v1.8h
+ // results of gradient_v0
+ sub v2.8h, v2.8h, v3.8h
+
+ sshr v4.8h, v4.8h, #6
+ sshr v5.8h, v5.8h, #6
+ sshr v6.8h, v6.8h, #6
+ sshr v7.8h, v7.8h, #6
+ // results of gradient_h1
+ sub v4.8h, v4.8h, v5.8h
+ // results of gradient_v1
+ sub v6.8h, v6.8h, v7.8h
+
+ add x10, x10, #16
+ add x14, x14, #16
+
+ // (gradient_h0 + gradient_h1) >> 1
+ shadd v1.8h, v0.8h, v4.8h
+ // gradient_h0 - gradient_h1
+ sub v5.8h, v0.8h, v4.8h
+
+ subs w11, w11, #8
+
+ // (gradient_v0 + gradient_v1) >> 1
+ shadd v3.8h, v2.8h, v6.8h
+ // gradient_v0 - gradient_v1
+ sub v7.8h, v2.8h, v6.8h
+
+ st1 {v1.8h}, [x12], #16
+ st1 {v5.8h}, [x15], #16
+ st1 {v3.8h}, [x13], #16
+ st1 {v7.8h}, [x16], #16
+ b.ne 2b
+
+ subs height, height, #1
+ add gh0, gh0, #(BDOF_BLOCK_SIZE << 1)
+ add gv0, gv0, #(BDOF_BLOCK_SIZE << 1)
+ add src0, src0, #(VVC_MAX_PB_SIZE << 1)
+ add gh1, gh1, #(BDOF_BLOCK_SIZE << 1)
+ add gv1, gv1, #(BDOF_BLOCK_SIZE << 1)
+ add src1, src1, #(VVC_MAX_PB_SIZE << 1)
+ b.ne 1b
+ ret
+
+.unreq gh0
+.unreq gh1
+.unreq gv0
+.unreq gv1
+.unreq src0
+.unreq src1
+.unreq width
+.unreq height
+endfunc
+
+.macro vvc_apply_bdof_block_8x bit_depth
dst .req x0
dst_stride .req x1
src0 .req x2
@@ -726,33 +812,28 @@ endfunc
vx .req x6
vy .req x7
- ld1r {v0.8h}, [vx], #2
- ld1r {v1.8h}, [vy], #2
- ld1r {v2.8h}, [vx]
- ld1r {v3.8h}, [vy]
- ins v0.d[1], v2.d[1]
- ins v1.d[1], v3.d[1]
-
+ ldr w8, [sp]
movi v7.4s, #(1 << (14 - \bit_depth))
- ldp x8, x9, [gh]
- ldp x10, x11, [gv]
mov x12, #(BDOF_BLOCK_SIZE * 2)
- mov w13, #(BDOF_MIN_BLOCK_SIZE)
mov x14, #(VVC_MAX_PB_SIZE * 2)
.if \bit_depth >= 10
// clip pixel
mov w15, #((1 << \bit_depth) - 1)
movi v18.8h, #0
- lsl dst_stride, dst_stride, #1
dup v19.8h, w15
.endif
+
+0:
+ ld1r {v0.8h}, [vx], #2
+ ld1r {v1.8h}, [vy], #2
+ ld1r {v2.8h}, [vx]
+ ld1r {v3.8h}, [vy]
+ mov w13, #(BDOF_MIN_BLOCK_SIZE)
+ ins v0.d[1], v2.d[1]
+ ins v1.d[1], v3.d[1]
1:
- ld1 {v2.8h}, [x8], x12
- ld1 {v3.8h}, [x9], x12
- ld1 {v4.8h}, [x10], x12
- ld1 {v5.8h}, [x11], x12
- sub v2.8h, v2.8h, v3.8h
- sub v4.8h, v4.8h, v5.8h
+ ld1 {v2.8h}, [gh], x12
+ ld1 {v4.8h}, [gv], x12
smull v3.4s, v0.4h, v2.4h
smull2 v16.4s, v0.8h, v2.8h
smlal v3.4s, v1.4h, v4.4h
@@ -780,6 +861,11 @@ endfunc
st1 {v5.8h}, [dst], dst_stride
.endif
b.ne 1b
+
+ subs w8, w8, #(BDOF_MIN_BLOCK_SIZE)
+ add vx, vx, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
+ add vy, vy, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
+ b.ne 0b
ret
.unreq dst
@@ -792,16 +878,128 @@ endfunc
.unreq vy
.endm
-function ff_vvc_apply_bdof_block_8_neon, export=1
- vvc_apply_bdof_block 8
+function ff_vvc_apply_bdof_block_8x_8_neon, export=1
+ vvc_apply_bdof_block_8x 8
endfunc
-function ff_vvc_apply_bdof_block_10_neon, export=1
- vvc_apply_bdof_block 10
+function ff_vvc_apply_bdof_block_8x_10_neon, export=1
+ vvc_apply_bdof_block_8x 10
endfunc
-function ff_vvc_apply_bdof_block_12_neon, export=1
- vvc_apply_bdof_block 12
+function ff_vvc_apply_bdof_block_8x_12_neon, export=1
+ vvc_apply_bdof_block_8x 12
+endfunc
+
+.macro vvc_apply_bdof_block_16x bit_depth
+ dst .req x0
+ dst_stride .req x1
+ src0 .req x2
+ src1 .req x3
+ gh .req x4
+ gv .req x5
+ vx .req x6
+ vy .req x7
+
+ ldr w8, [sp]
+ movi v7.4s, #(1 << (14 - \bit_depth))
+.if \bit_depth >= 10
+ // clip pixel
+ mov w15, #((1 << \bit_depth) - 1)
+ movi v18.8h, #0
+ dup v19.8h, w15
+.endif
+
+0:
+ ld1r {v0.8h}, [vx], #2
+ ld1r {v1.8h}, [vy], #2
+ ld1r {v2.8h}, [vx], #2
+ ld1r {v3.8h}, [vy], #2
+
+ mov w13, #(BDOF_MIN_BLOCK_SIZE)
+
+ ld1r {v20.8h}, [vx], #2
+ ld1r {v21.8h}, [vy], #2
+ ld1r {v22.8h}, [vx], #2
+ ld1r {v23.8h}, [vy], #2
+
+ ins v0.d[1], v2.d[1]
+ ins v1.d[1], v3.d[1]
+ ins v20.d[1], v22.d[1]
+ ins v21.d[1], v23.d[1]
+1:
+ ldp q2, q22, [gh], #(BDOF_BLOCK_SIZE * 2)
+ ldp q4, q24, [gv], #(BDOF_BLOCK_SIZE * 2)
+ smull v3.4s, v0.4h, v2.4h
+ smull2 v16.4s, v0.8h, v2.8h
+ smlal v3.4s, v1.4h, v4.4h
+ smlal2 v16.4s, v1.8h, v4.8h
+
+ ldp q5, q25, [src0], #(VVC_MAX_PB_SIZE * 2)
+ ldp q6, q26, [src1], #(VVC_MAX_PB_SIZE * 2)
+
+ smull v23.4s, v20.4h, v22.4h
+ smull2 v27.4s, v20.8h, v22.8h
+ smlal v23.4s, v21.4h, v24.4h
+ smlal2 v27.4s, v21.8h, v24.8h
+
+ saddl v2.4s, v5.4h, v6.4h
+ add v2.4s, v2.4s, v7.4s
+ add v2.4s, v2.4s, v3.4s
+ saddl2 v4.4s, v5.8h, v6.8h
+ add v4.4s, v4.4s, v7.4s
+ add v4.4s, v4.4s, v16.4s
+
+ saddl v22.4s, v25.4h, v26.4h
+ add v22.4s, v22.4s, v7.4s
+ add v22.4s, v22.4s, v23.4s
+ saddl2 v24.4s, v25.8h, v26.8h
+ add v24.4s, v24.4s, v7.4s
+ add v24.4s, v24.4s, v27.4s
+
+ sqshrn v5.4h, v2.4s, #(15 - \bit_depth)
+ sqshrn2 v5.8h, v4.4s, #(15 - \bit_depth)
+ sqshrn v25.4h, v22.4s, #(15 - \bit_depth)
+ sqshrn2 v25.8h, v24.4s, #(15 - \bit_depth)
+
+ subs w13, w13, #1
+.if \bit_depth == 8
+ sqxtun v5.8b, v5.8h
+ sqxtun2 v5.16b, v25.8h
+ str q5, [dst]
+.else
+ smin v5.8h, v5.8h, v19.8h
+ smax v5.8h, v5.8h, v18.8h
+ smin v25.8h, v25.8h, v19.8h
+ smax v25.8h, v25.8h, v18.8h
+ stp q5, q25, [dst]
+.endif
+ add dst, dst, dst_stride
+ b.ne 1b
+
+ subs w8, w8, #(BDOF_MIN_BLOCK_SIZE)
+ b.ne 0b
+ ret
+
+.unreq dst
+.unreq dst_stride
+.unreq src0
+.unreq src1
+.unreq gh
+.unreq gv
+.unreq vx
+.unreq vy
+.endm
+
+function ff_vvc_apply_bdof_block_16x_8_neon, export=1
+ vvc_apply_bdof_block_16x 8
+endfunc
+
+function ff_vvc_apply_bdof_block_16x_10_neon, export=1
+ vvc_apply_bdof_block_16x 10
+endfunc
+
+function ff_vvc_apply_bdof_block_16x_12_neon, export=1
+ vvc_apply_bdof_block_16x 12
endfunc
const bdof_vx_vy_8x_tbl
@@ -885,8 +1083,8 @@ endconst
/*
* x0: const int16_t *_src0,
* x1: const int16_t *_src1,
- * x2: int16_t *gradient_h[2],
- * x3: int16_t *gradient_v[2],
+ * x2: const int16_t *gradient_h,
+ * x3: const int16_t *gradient_v,
* x4: int16_t vx[16],
* x5: int16_t vy[16],
* w6: int block_h
@@ -895,8 +1093,6 @@ function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
stp d11, d10, [sp, #-0x20]!
stp d9, d8, [sp, #0x10]
- ldp x14, x13, [x2] // gh0, gh1
- ldp x10, x9, [x3] // gv0, gv1
movrel x11, bdof_vx_vy_8x_tbl
ldr q0, [x11] // table
mvni v2.4s, #30 // -31, for log2
@@ -964,17 +1160,13 @@ function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
9:
ldr q28, [x0] // src0
ldr q29, [x1] // src1
- ldr q30, [x14], #(BDOF_BLOCK_SIZE * 2) // gh0
- ldr q31, [x13], #(BDOF_BLOCK_SIZE * 2) // gh1
- ldr q8, [x10], #(BDOF_BLOCK_SIZE * 2) // gv0
- ldr q9, [x9], #(BDOF_BLOCK_SIZE * 2) // gv1
+ ldr q30, [x2], #(BDOF_BLOCK_SIZE * 2) // (gh0 + gh1) >> 1
+ ldr q31, [x3], #(BDOF_BLOCK_SIZE * 2) // (gv0 + gv1) >> 1
add x0, x0, #(VVC_MAX_PB_SIZE * 2)
add x1, x1, #(VVC_MAX_PB_SIZE * 2)
sshr v28.8h, v28.8h, #0x4
sshr v29.8h, v29.8h, #0x4
- shadd v30.8h, v30.8h, v31.8h // tmph
- shadd v31.8h, v8.8h, v9.8h // tmpv
sub v8.8h, v28.8h, v29.8h // diff
abs v28.8h, v30.8h
@@ -1033,8 +1225,8 @@ endfunc
/*
* x0: const int16_t *_src0,
* x1: const int16_t *_src1,
- * x2: int16_t *gradient_h[2],
- * x3: int16_t *gradient_v[2],
+ * x2: const int16_t *gradient_h,
+ * x3: const int16_t *gradient_v,
* x4: int16_t vx[16],
* x5: int16_t vy[16],
* w6: int block_h
@@ -1047,8 +1239,6 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
stp d9, d8, [sp, #0x60]
stp x29, x30, [sp, #0x70]
- ldp x8, x9, [x2] // gh0, gh1
- ldp x10, x11, [x3] // gv0, gv1
movrel x12, bdof_vx_vy_16x_tbl
ldp q0, q1, [x12] // table
mov w13, w6 // y = block_h
@@ -1110,17 +1300,11 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
sshr v31.8h, v29.8h, #0x4
ld1 {v8.8h, v9.8h}, [x1] // src1
sshr v10.8h, v8.8h, #0x4
- ld1 {v11.8h, v12.8h}, [x8], #32 // gh0
+ ldp q13, q8, [x2], #32 // (gh0 + gh1) >> 1
sshr v29.8h, v30.8h, #0x4
sshr v30.8h, v9.8h, #0x4
- ld1 {v8.8h, v9.8h}, [x9], #32 // gh1
- shadd v13.8h, v11.8h, v8.8h // (gh0 + gh1) >> 1, left half
- ld1 {v14.8h, v15.8h}, [x10], #32 // gv0
- ld1 {v3.8h, v4.8h}, [x11], #32 // gv1
- shadd v5.8h, v14.8h, v3.8h // (gv0 + gv1) >> 1, left half
+ ldp q5, q3, [x3], #32 // (gv0 + gv1) >> 1
sub v31.8h, v31.8h, v10.8h // diff, left half
- shadd v8.8h, v12.8h, v9.8h // (gh0 + gh1) >> 1, right half
- shadd v3.8h, v15.8h, v4.8h // (gv0 + gv1) >> 1, right half
sub v4.8h, v29.8h, v30.8h // diff, right half
abs v29.8h, v13.8h
@@ -1189,3 +1373,129 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
add sp, sp, #0x80
ret
endfunc
+
+function ff_vvc_apply_bdof_10_neon, export=1
+ mov w6, #10
+ b 0f
+endfunc
+
+function ff_vvc_apply_bdof_12_neon, export=1
+ mov w6, #12
+ b 0f
+endfunc
+
+// int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
+// int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
+// int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
+#define APPLY_BDOF_STACK_SIZE ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 4)
+#define GRADIENT_H0_OFFSET 2
+#define GRADIENT_H1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 2 + 2)
+#define GRADIENT_V0_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 4 + 2)
+#define GRADIENT_V1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 6 + 2)
+#define VX_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8)
+#define VY_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 2)
+function ff_vvc_apply_bdof_8_neon, export=1
+ mov w6, #8
+0:
+ stp x19, x20, [sp, #-0x40]!
+ stp x21, x22, [sp, #0x10]
+ stp x23, x24, [sp, #0x20]
+ stp x25, x30, [sp, #0x30]
+
+ sub sp, sp, #APPLY_BDOF_STACK_SIZE
+ mov w19, w6 // bit_depth
+ mov x20, x0 // dst
+ mov x21, x1 // dst_stride
+ mov x22, x2 // src0
+ mov x23, x3 // src1
+ mov w24, w4 // block_w
+ mov w25, w5 // block_h
+
+ // int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
+ add x0, sp, #GRADIENT_H0_OFFSET
+ add x1, sp, #GRADIENT_H1_OFFSET
+ add x2, sp, #GRADIENT_V0_OFFSET
+ add x3, sp, #GRADIENT_V1_OFFSET
+ mov x4, x22
+ mov x5, x23
+ mov w6, w24
+ mov w7, w25
+ bl X(ff_vvc_bdof_grad_filter_8x_neon)
+
+ cmp w24, #8
+ mov x0, x22 // src0
+ mov x1, x23 // src1
+ add x2, sp, #GRADIENT_H0_OFFSET // gh0
+ add x3, sp, #GRADIENT_V0_OFFSET // gv0
+ add x4, sp, #VX_OFFSET // vx
+ add x5, sp, #VY_OFFSET // vy
+ mov w6, w25 // block_h
+
+ b.gt 16f
+
+ bl X(ff_vvc_derive_bdof_vx_vy_8x_neon)
+ cmp w19, #10 // check bitdepth
+ mov x0, x20 // dst
+ mov x1, x21 // dst_stride
+ mov x2, x22 // src0
+ mov x3, x23 // src1
+ add x4, sp, #GRADIENT_H1_OFFSET // gh1
+ add x5, sp, #GRADIENT_V1_OFFSET // gv1
+ add x6, sp, #VX_OFFSET
+ add x7, sp, #VY_OFFSET
+ str w25, [sp]
+ b.eq 1f
+ b.gt 2f
+ // 8bit
+0:
+ bl X(ff_vvc_apply_bdof_block_8x_8_neon)
+ b 32f
+1:
+ // 10bit
+ bl X(ff_vvc_apply_bdof_block_8x_10_neon)
+ b 32f
+2:
+ // 12bit
+ bl X(ff_vvc_apply_bdof_block_8x_12_neon)
+ b 32f
+16:
+ bl X(ff_vvc_derive_bdof_vx_vy_16x_neon)
+
+ cmp w19, #10 // check bitdepth
+ mov x0, x20 // dst
+ mov x1, x21 // dst_stride
+ mov x2, x22 // src0
+ mov x3, x23 // src1
+ add x4, sp, #GRADIENT_H1_OFFSET // gh1
+ add x5, sp, #GRADIENT_V1_OFFSET // gv1
+ add x6, sp, #VX_OFFSET
+ add x7, sp, #VY_OFFSET
+ str w25, [sp]
+ b.eq 17f
+ b.gt 18f
+ // 8bit
+ bl X(ff_vvc_apply_bdof_block_16x_8_neon)
+ b 32f
+17:
+ // 10bit
+ bl X(ff_vvc_apply_bdof_block_16x_10_neon)
+ b 32f
+18:
+ // 12bit
+ bl X(ff_vvc_apply_bdof_block_16x_12_neon)
+32:
+ add sp, sp, #APPLY_BDOF_STACK_SIZE
+ ldp x25, x30, [sp, #0x30]
+ ldp x23, x24, [sp, #0x20]
+ ldp x21, x22, [sp, #0x10]
+ ldp x19, x20, [sp], #0x40
+ ret
+endfunc
+
+#undef APPLY_BDOF_STACK_SIZE
+#undef GRADIENT_H0_OFFSET
+#undef GRADIENT_H1_OFFSET
+#undef GRADIENT_V0_OFFSET
+#undef GRADIENT_V1_OFFSET
+#undef VX_OFFSET
+#undef VY_OFFSET
diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
deleted file mode 100644
index d8ddaacb14..0000000000
--- a/libavcodec/aarch64/vvc/of_template.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/bit_depth_template.c"
-
-void FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(pixel* dst,
- ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
- const int16_t **gh, const int16_t **gv, int16_t *vx, int16_t *vy);
-
-static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
- const int16_t *_src0, const int16_t *_src1,
- int block_w, int block_h) {
- // +2 for pad left and right
- int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
- int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
- int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
- int16_t *gradient_v[2] = {&gradient_buf_v[0][1], &gradient_buf_v[1][1]};
- ptrdiff_t dst_stride = _dst_stride / sizeof(pixel);
- pixel *dst = (pixel *) _dst;
-
- ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0],
- BDOF_BLOCK_SIZE,
- _src0, MAX_PB_SIZE, block_w, block_h);
- ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
- BDOF_BLOCK_SIZE,
- _src1, MAX_PB_SIZE, block_w, block_h);
- int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
- if (block_w == 8)
- ff_vvc_derive_bdof_vx_vy_8x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
- else
- ff_vvc_derive_bdof_vx_vy_16x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
-
- for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
- for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
- const int16_t *src0 = _src0 + y * MAX_PB_SIZE + x;
- const int16_t *src1 = _src1 + y * MAX_PB_SIZE + x;
- pixel *d = dst + x;
- int idx = BDOF_BLOCK_SIZE * y + x;
- const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
- const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
- int idx1 = y + x / BDOF_MIN_BLOCK_SIZE;
- FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
- src0, src1, gh, gv,
- vx + idx1, vy + idx1);
- }
- dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
- }
-}
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-08-14 16:35 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-08-14 16:35 [FFmpeg-devel] [PATCH] vvc-bdof-rework-2 (PR #20241) Zhao Zhili
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git