* [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: Optimize derive_bdof_vx_vy
From: Zhao Zhili @ 2025-06-20 13:15 UTC
To: ffmpeg-devel; +Cc: Zhao Zhili
From: Zhao Zhili <zhilizhao@tencent.com>
                           |      Before     |      After
-----------------------------------------------------------------
apply_bdof_8_8x16_c: | 7375.5 ( 1.00x) | 7473.8 ( 1.00x)
apply_bdof_8_8x16_neon: | 1875.1 ( 3.93x) | 1135.8 ( 6.58x)
apply_bdof_8_16x8_c: | 7273.9 ( 1.00x) | 7204.0 ( 1.00x)
apply_bdof_8_16x8_neon: | 1738.2 ( 4.18x) | 1013.0 ( 7.11x)
apply_bdof_8_16x16_c: | 14744.9 ( 1.00x) | 14712.6 ( 1.00x)
apply_bdof_8_16x16_neon: | 3446.7 ( 4.28x) | 1997.7 ( 7.36x)
apply_bdof_10_8x16_c: | 7352.4 ( 1.00x) | 7485.7 ( 1.00x)
apply_bdof_10_8x16_neon: | 1861.0 ( 3.95x) | 1134.1 ( 6.60x)
apply_bdof_10_16x8_c: | 7330.5 ( 1.00x) | 7232.8 ( 1.00x)
apply_bdof_10_16x8_neon: | 1747.2 ( 4.20x) | 1002.6 ( 7.21x)
apply_bdof_10_16x16_c: | 14522.4 ( 1.00x) | 14664.8 ( 1.00x)
apply_bdof_10_16x16_neon: | 3490.5 ( 4.16x) | 1978.4 ( 7.41x)
apply_bdof_12_8x16_c: | 7389.0 ( 1.00x) | 7380.1 ( 1.00x)
apply_bdof_12_8x16_neon: | 1861.3 ( 3.97x) | 1134.0 ( 6.51x)
apply_bdof_12_16x8_c: | 7283.1 ( 1.00x) | 7336.9 ( 1.00x)
apply_bdof_12_16x8_neon: | 1749.1 ( 4.16x) | 1002.3 ( 7.32x)
apply_bdof_12_16x16_c: | 14580.7 ( 1.00x) | 14502.7 ( 1.00x)
apply_bdof_12_16x16_neon: | 3472.9 ( 4.20x) | 1978.3 ( 7.33x)
---
libavcodec/aarch64/vvc/dsp_init.c | 17 +-
libavcodec/aarch64/vvc/inter.S | 632 ++++++++++++++++-----------
libavcodec/aarch64/vvc/of_template.c | 15 +-
3 files changed, 399 insertions(+), 265 deletions(-)
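
For reviewers who want the reference math in front of them, here is an
illustrative scalar sketch of what derive_bdof_vx_vy computes per 4x4
sub-block. Helper names are hypothetical; the shifts and clips mirror the
ones visible in the assembly below (sources >> 4, summed gradients >> 1,
numerators << 2, shift by floor(log2(...)), clip to [-15, 15]):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    static int floor_log2(unsigned v) { int n = -1; for (; v; v >>= 1) n++; return n; }
    static int sign3(int v)  { return (v > 0) - (v < 0); }
    static int clip15(int v) { return v < -15 ? -15 : v > 15 ? 15 : v; }

    /* src0/src1 and the gradient pointers address sample (0,0) of the 4x4
     * block; strides are in int16_t units.  The one-sample border around
     * the block is assumed to be valid (already padded) here. */
    static void derive_bdof_vx_vy_4x4(const int16_t *src0, const int16_t *src1,
                                      ptrdiff_t src_stride,
                                      const int16_t *gh0, const int16_t *gh1,
                                      const int16_t *gv0, const int16_t *gv1,
                                      ptrdiff_t grad_stride,
                                      int16_t *vx, int16_t *vy)
    {
        int sgx2 = 0, sgy2 = 0, sgxgy = 0, sgxdi = 0, sgydi = 0;

        for (int dy = -1; dy <= 4; dy++) {     /* 6x6 window around the block */
            for (int dx = -1; dx <= 4; dx++) {
                ptrdiff_t s = dy * src_stride  + dx;
                ptrdiff_t g = dy * grad_stride + dx;
                int diff  = (src0[s] >> 4) - (src1[s] >> 4);
                int temph = (gh0[g] + gh1[g]) >> 1;
                int tempv = (gv0[g] + gv1[g]) >> 1;
                sgx2  += abs(temph);
                sgy2  += abs(tempv);
                sgxgy += sign3(tempv) * temph;
                sgxdi -= sign3(temph) * diff;
                sgydi -= sign3(tempv) * diff;
            }
        }
        *vx = sgx2 ? clip15((sgxdi * 4) >> floor_log2(sgx2)) : 0;
        *vy = sgy2 ? clip15(((sgydi * 4) - ((*vx * sgxgy) >> 1))
                            >> floor_log2(sgy2)) : 0;
    }

The rewrite vectorizes this over a whole column of 4x4 blocks per call:
block_h is passed in, the border rows shared between vertically adjacent
blocks (line3/line4 of one block become line(-1)/line0 of the next) stay
in registers, and the old per-block pad_mask handling is replaced by
TBL-based edge duplication.
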
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 9a171234f6..1db38ebb1d 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -37,11 +37,18 @@ void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
ptrdiff_t src_stride,
int width, int height);
-void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0, const int16_t *_src1,
- int pad_mask,
- const int16_t **gradient_h,
- const int16_t **gradient_v,
- int16_t *vx, int16_t *vy);
+void ff_vvc_derive_bdof_vx_vy_8x_neon(const int16_t *_src0,
+ const int16_t *_src1,
+ int16_t *const gradient_h[2],
+ int16_t *const gradient_v[2],
+ int16_t vx[16], int16_t vy[16],
+ int block_h);
+void ff_vvc_derive_bdof_vx_vy_16x_neon(const int16_t *_src0,
+ const int16_t *_src1,
+ int16_t *const gradient_h[2],
+ int16_t *const gradient_v[2],
+ int16_t vx[16], int16_t vy[16],
+ int block_h);
#define BIT_DEPTH 8
#include "alf_template.c"
#include "of_template.c"
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index c299e6f68b..06c6f3619b 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -804,262 +804,388 @@ function ff_vvc_apply_bdof_block_12_neon, export=1
vvc_apply_bdof_block 12
endfunc
-function ff_vvc_derive_bdof_vx_vy_neon, export=1
- src0 .req x0
- src1 .req x1
- pad_mask .req w2
- gh .req x3
- gv .req x4
- vx .req x5
- vy .req x6
-
- gh0 .req x7
- gh1 .req x8
- gv0 .req x9
- gv1 .req x10
- y .req x12
-
- sgx2 .req w7
- sgy2 .req w8
- sgxgy .req w9
- sgxdi .req w10
- sgydi .req w11
-
- sgx2_v .req v22
- sgy2_v .req v23
- sgxgy_v .req v24
- sgxdi_v .req v25
- sgydi_v .req v26
-
- sgx2_v2 .req v27
- sgy2_v2 .req v28
- sgxgy_v2 .req v29
- sgxdi_v2 .req v30
- sgydi_v2 .req v31
-
- ldp gh0, gh1, [gh]
- ldp gv0, gv1, [gv]
- movi sgx2_v.4s, #0
- movi sgy2_v.4s, #0
- movi sgxgy_v.4s, #0
- movi sgxdi_v.4s, #0
- movi sgydi_v.4s, #0
- movi sgx2_v2.4s, #0
- movi sgy2_v2.4s, #0
- movi sgxgy_v2.4s, #0
- movi sgxdi_v2.4s, #0
- movi sgydi_v2.4s, #0
- mov x13, #-1 // dy
- movi v6.4s, #0
- mov y, #-1
- tbz pad_mask, #1, 1f // check pad top
- mov x13, #0 // dy: pad top
+const bdof_vx_vy_8x_tbl
+ .byte 0, 1, 16, 16, 16, 16, 8, 9
+ .byte 6, 7, 16, 16, 16, 16, 14, 15
+endconst
+
+const bdof_vx_vy_16x_tbl
+ .byte 0, 1, 64, 64, 64, 64, 8, 9
+ .byte 6, 7, 64, 64, 64, 64, 16, 17
+ .byte 14, 15, 64, 64, 64, 64, 24, 25
+ .byte 22, 23, 64, 64, 64, 64, 30, 31
+endconst
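+
+// Note: TBL returns 0 for out-of-range indices (>= 16 for the
+// one-register form used with the 8x table, >= 32 for the two-register
+// form used with the 16x table), which is what the 16/64 entries
+// select.  Each table picks out the duplicated edge halfwords of every
+// 4x4 block and zeros the rest, so tbl + saddl + addp computes the
+// left/right-padded 6-sample window (dx = -1..4) of each block without
+// scalar edge handling.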
+
+// line(-1), line0, line1, line2, line3, line4
+// line3 and line4 become line(-1) and line0 of the next block.
+.macro bdof_vx_vy_8x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
+ mov \tmp0\().16b, v28.16b
+ mov \tmp1\().16b, v29.16b
+ mov \tmp2\().16b, v30.16b
+ mov \tmp3\().16b, v31.16b
+ mov \tmp4\().16b, v8.16b
+.endm
+
+.macro bdof_vx_vy_8x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
+ add v25.4s, v25.4s, \tmp0\().4s
+ add v27.4s, v27.4s, \tmp1\().4s
+ add v23.4s, v23.4s, \tmp2\().4s
+ sub v26.4s, v26.4s, \tmp3\().4s
+ sub v24.4s, v24.4s, \tmp4\().4s
+.endm
+
+.macro bdof_vx_vy_8x_padding_left_right src, tmp0, tmp1, dst
+ tbl \tmp0\().16b, { \src\().16b }, v0.16b
+ saddl \tmp1\().4s, \tmp0\().4h, \src\().4h
+ saddl2 \dst\().4s, \tmp0\().8h, \src\().8h
+ addp \dst\().4s, \tmp1\().4s, \dst\().4s
+.endm
+
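+// Vector VVC_SIGN(): cmlt/cmgt produce 0 / -1 lane masks, so
+// mask(src < 0) - mask(src > 0) is -1, 0 or +1 per lane.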
+.macro bdof_vx_vy_sign src, tmp0, tmp1, dst
+ cmlt \tmp0\().8h, \src\().8h, #0
+ cmgt \tmp1\().8h, \src\().8h, #0
+ sub \dst\().8h, \tmp0\().8h, \tmp1\().8h
+.endm
+
+.macro bdof_vx_vy_clip_mask src, max, min, mask, dst
+ smin \src\().4s, \src\().4s, \max\().4s
+ smax \src\().4s, \src\().4s, \min\().4s
+ cmgt \mask\().4s, \mask\().4s, #0
+ and \dst\().16b, \src\().16b, \mask\().16b
+.endm
+
+.macro bdof_vx_vy_16x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
+ mov \tmp0\().16b, v29.16b
+ mov \tmp1\().16b, v30.16b
+ mov \tmp2\().16b, v31.16b
+ mov \tmp3\().16b, v8.16b
+ mov \tmp4\().16b, v9.16b
+.endm
+
+.macro bdof_vx_vy_16x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
+ add v25.4s, v25.4s, \tmp0\().4s
+ add v24.4s, v24.4s, \tmp1\().4s
+ add v26.4s, v26.4s, \tmp2\().4s
+ sub v28.4s, v28.4s, \tmp3\().4s
+ sub v27.4s, v27.4s, \tmp4\().4s
+.endm
+
+.macro bdof_vx_vy_16x_padding_left_right src0, src1, tmp0, tmp1, tmp2, dst
+ tbl \tmp0\().16b, {\src0\().16b, \src1\().16b}, v0.16b
+ tbl v2.16b, {\src0\().16b, \src1\().16b}, v1.16b
+ saddl \tmp1\().4s, \tmp0\().4h, \src0\().4h
+ saddl \tmp2\().4s, v2.4h, \src1\().4h
+ saddl2 \tmp0\().4s, \tmp0\().8h, \src0\().8h
+ saddl2 \dst\().4s, v2.8h, \src1\().8h
+ addp \tmp0\().4s, \tmp1\().4s, \tmp0\().4s
+ addp \dst\().4s, \tmp2\().4s, \dst\().4s
+ addp \dst\().4s, \tmp0\().4s, \dst\().4s
+.endm
+
+/*
+ * x0: const int16_t *_src0,
+ * x1: const int16_t *_src1,
+ * x2: int16_t *gradient_h[2],
+ * x3: int16_t *gradient_v[2],
+ * x4: int16_t vx[16],
+ * x5: int16_t vy[16],
+ * w6: int block_h
+ */
+function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
+ stp d11, d10, [sp, #-0x20]!
+ stp d9, d8, [sp, #0x10]
+
+ ldp x14, x13, [x2] // gh0, gh1
+ ldp x10, x9, [x3] // gv0, gv1
+ movrel x11, bdof_vx_vy_8x_tbl
+ ldr q0, [x11] // table
+ mvni v2.4s, #30 // -31, for log2
+ movi v3.4s, #15 // clip to 15
+ mvni v4.4s, #14 // clip to -15
+
+ mov w11, #0x8
+ mov w12, w6 // y = block_h
+ b 4f
+
1:
- mov x16, #-2 // dx
- add x14, src0, x13, lsl #8 // local src0
- add x15, src1, x13, lsl #8 // local src1
- add x17, x16, x13, lsl #5
- ldr q0, [x14, x16]
- ldr q1, [x15, x16]
- ldr q2, [gh0, x17]
- ldr q3, [gh1, x17]
- ldr q4, [gv0, x17]
- ldr q5, [gv1, x17]
- add x16, x16, #8
- add x17, x17, #8
- ins v0.s[3], v6.s[3]
- ins v1.s[3], v6.s[3]
- ins v2.s[3], v6.s[3]
- ins v3.s[3], v6.s[3]
- ins v4.s[3], v6.s[3]
- ins v5.s[3], v6.s[3]
-
- ldr q16, [x14, x16]
- ldr q17, [x15, x16]
- ldr q18, [gh0, x17]
- ldr q19, [gh1, x17]
- ldr q20, [gv0, x17]
- ldr q21, [gv1, x17]
- ins v16.s[3], v6.s[3]
- ins v17.s[3], v6.s[3]
- ins v18.s[3], v6.s[3]
- ins v19.s[3], v6.s[3]
- ins v20.s[3], v6.s[3]
- ins v21.s[3], v6.s[3]
-
- tbz pad_mask, #0, 20f
- // pad left
- ins v0.h[0], v0.h[1]
- ins v1.h[0], v1.h[1]
- ins v2.h[0], v2.h[1]
- ins v3.h[0], v3.h[1]
- ins v4.h[0], v4.h[1]
- ins v5.h[0], v5.h[1]
-20:
- tbz pad_mask, #2, 21f
- // pad right
- ins v16.h[5], v16.h[4]
- ins v17.h[5], v17.h[4]
- ins v18.h[5], v18.h[4]
- ins v19.h[5], v19.h[4]
- ins v20.h[5], v20.h[4]
- ins v21.h[5], v21.h[4]
-21:
- sshr v0.8h, v0.8h, #4
- sshr v1.8h, v1.8h, #4
- add v2.8h, v2.8h, v3.8h
- add v4.8h, v4.8h, v5.8h
- sub v0.8h, v0.8h, v1.8h // diff
- sshr v2.8h, v2.8h, #1 // temph
- sshr v4.8h, v4.8h, #1 // tempv
-
- sshr v16.8h, v16.8h, #4
- sshr v17.8h, v17.8h, #4
- add v18.8h, v18.8h, v19.8h
- add v20.8h, v20.8h, v21.8h
- sub v16.8h, v16.8h, v17.8h // diff
- sshr v18.8h, v18.8h, #1 // temph
- sshr v20.8h, v20.8h, #1 // tempv
-
- abs v3.8h, v2.8h
- abs v5.8h, v4.8h
- uxtl v19.4s, v3.4h
- uxtl v21.4s, v5.4h
- uxtl2 v3.4s, v3.8h
- uxtl2 v5.4s, v5.8h
- add v3.4s, v3.4s, v19.4s
- add v5.4s, v5.4s, v21.4s
- add sgx2_v.4s, sgx2_v.4s, v3.4s
- add sgy2_v.4s, sgy2_v.4s, v5.4s
-
- abs v3.8h, v18.8h
- abs v5.8h, v20.8h
- uxtl v19.4s, v3.4h
- uxtl v21.4s, v5.4h
- uxtl2 v3.4s, v3.8h
- uxtl2 v5.4s, v5.8h
- add v3.4s, v3.4s, v19.4s
- add v5.4s, v5.4s, v21.4s
- add sgx2_v2.4s, sgx2_v2.4s, v3.4s
- add sgy2_v2.4s, sgy2_v2.4s, v5.4s
-
- cmgt v17.8h, v4.8h, #0
- cmlt v7.8h, v4.8h, #0
- cmgt v19.8h, v20.8h, #0
- cmlt v21.8h, v20.8h, #0
- sub v17.8h, v7.8h, v17.8h // VVC_SIGN(tempv)
- sub v19.8h, v21.8h, v19.8h // VVC_SIGN(tempv)
-
- smlal sgxgy_v.4s, v17.4h, v2.4h
- smlal2 sgxgy_v.4s, v17.8h, v2.8h
- smlsl sgydi_v.4s, v17.4h, v0.4h
- smlsl2 sgydi_v.4s, v17.8h, v0.8h
-
- cmgt v3.8h, v2.8h, #0
- cmlt v5.8h, v2.8h, #0
- cmgt v17.8h, v18.8h, #0
- cmlt v21.8h, v18.8h, #0
- sub v3.8h, v5.8h, v3.8h // VVC_SIGN(temph)
- sub v17.8h, v21.8h, v17.8h // VVC_SIGN(temph)
-
- smlal sgxgy_v2.4s, v19.4h, v18.4h
- smlal2 sgxgy_v2.4s, v19.8h, v18.8h
- smlsl sgydi_v2.4s, v19.4h, v16.4h
- smlsl2 sgydi_v2.4s, v19.8h, v16.8h
-
- smlsl sgxdi_v.4s, v3.4h, v0.4h
- smlsl2 sgxdi_v.4s, v3.8h, v0.8h
- smlsl sgxdi_v2.4s, v17.4h, v16.4h
- smlsl2 sgxdi_v2.4s, v17.8h, v16.8h
-3:
- add y, y, #1
- cmp y, #(BDOF_MIN_BLOCK_SIZE)
- mov x13, y
- b.gt 4f
- b.lt 1b
- tbz pad_mask, #3, 1b
- sub x13, x13, #1 // pad bottom
- b 1b
+ // save line4 results
+ bdof_vx_vy_8x_save_line v5, v6, v7, v16, v17
+2:
+ addp v25.4s, v25.4s, v25.4s
+ addp v27.4s, v27.4s, v27.4s
+ addp v26.4s, v26.4s, v26.4s
+ addp v23.4s, v23.4s, v23.4s
+ addp v24.4s, v24.4s, v24.4s
+
+ clz v28.4s, v25.4s
+ add v28.4s, v28.4s, v2.4s // log2
+ shl v26.4s, v26.4s, #0x2
+ sshl v26.4s, v26.4s, v28.4s
+
+ bdof_vx_vy_clip_mask v26, v3, v4, v25, v25
+ sqxtn v26.4h, v25.4s
+ st1 {v26.s}[0], [x4], x11
+
+ subs x12, x12, #(BDOF_MIN_BLOCK_SIZE)
+
+ clz v26.4s, v27.4s
+ add v26.4s, v26.4s, v2.4s
+ shl v24.4s, v24.4s, #0x2
+ mul v23.4s, v25.4s, v23.4s
+ sshr v23.4s, v23.4s, #0x1
+ sub v23.4s, v24.4s, v23.4s
+ sshl v23.4s, v23.4s, v26.4s
+
+ bdof_vx_vy_clip_mask v23, v3, v4, v27, v23
+ sqxtn v23.4h, v23.4s
+ st1 {v23.s}[0], [x5], x11
+
+ b.eq 16f
4:
- addv s22, sgx2_v.4s
- addv s23, sgy2_v.4s
- addv s24, sgxgy_v.4s
- addv s25, sgxdi_v.4s
- addv s26, sgydi_v.4s
-
- mov w3, #31
- mov w16, #-15
- mov w17, #15
-40:
- mov w14, #0
-
- mov sgx2, v22.s[0]
- mov sgy2, v23.s[0]
- mov sgxgy, v24.s[0]
- mov sgxdi, v25.s[0]
- mov sgydi, v26.s[0]
-
- cbz sgx2, 5f
- clz w12, sgx2
- lsl sgxdi, sgxdi, #2
- sub w13, w3, w12 // log2(sgx2)
- asr sgxdi, sgxdi, w13
- cmp sgxdi, w16
- csel w14, w16, sgxdi, lt // clip to -15
- b.le 5f
- cmp sgxdi, w17
- csel w14, w17, sgxdi, gt // clip to 15
+ mov x15, #0x0 // dy, inner loop
+
+ movi v25.2d, #0
+ movi v27.2d, #0
+ movi v23.2d, #0
+ movi v26.2d, #0
+ movi v24.2d, #0
+ b 8f
+
5:
- strh w14, [vx], #2
-
- mov w15, #0
- cbz sgy2, 6f
- lsl sgydi, sgydi, #2
- smull x14, w14, sgxgy
- asr w14, w14, #1
- sub sgydi, sgydi, w14
- clz w12, sgy2
- sub w13, w3, w12 // log2(sgy2)
- asr sgydi, sgydi, w13
- cmp sgydi, w16
- csel w15, w16, sgydi, lt // clip to -15
- b.le 6f
- cmp sgydi, w17
- csel w15, w17, sgydi, gt // clip to 15
-6:
- strh w15, [vy], #2
- cbz x0, 7f
- addv s22, sgx2_v2.4s
- addv s23, sgy2_v2.4s
- addv s24, sgxgy_v2.4s
- addv s25, sgxdi_v2.4s
- addv s26, sgydi_v2.4s
- mov x0, #0
- b 40b
-7:
+ // add line(-1) and line0 from previous results
+ bdof_vx_vy_8x_add_line v18, v19, v20, v21, v22
+ bdof_vx_vy_8x_add_line v5, v6, v7, v16, v17
+ add x15, x15, #1
+8:
+ cmp w12, w6
+ b.hs 9f
+ // y < block_h && dy == 0, reuse previous results
+ cbz x15, 5b
+9:
+ ldr q28, [x0] // src0
+ ldr q29, [x1] // src1
+ ldr q30, [x14], #(BDOF_BLOCK_SIZE * 2) // gh0
+ ldr q31, [x13], #(BDOF_BLOCK_SIZE * 2) // gh1
+ ldr q8, [x10], #(BDOF_BLOCK_SIZE * 2) // gv0
+ ldr q9, [x9], #(BDOF_BLOCK_SIZE * 2) // gv1
+ add x0, x0, #(VVC_MAX_PB_SIZE * 2)
+ add x1, x1, #(VVC_MAX_PB_SIZE * 2)
+
+ sshr v28.8h, v28.8h, #0x4
+ sshr v29.8h, v29.8h, #0x4
+ shadd v30.8h, v30.8h, v31.8h // tmph
+ shadd v31.8h, v8.8h, v9.8h // tmpv
+ sub v8.8h, v28.8h, v29.8h // diff
+
+ abs v28.8h, v30.8h
+ abs v29.8h, v31.8h
+
+ bdof_vx_vy_8x_padding_left_right v28, v9, v10, v28
+ bdof_vx_vy_8x_padding_left_right v29, v9, v10, v29
+
+ bdof_vx_vy_sign v30, v9, v10, v9
+ bdof_vx_vy_sign v31, v10, v31, v31
+
+ mul v30.8h, v31.8h, v30.8h
+ mul v9.8h, v9.8h, v8.8h
+ mul v8.8h, v31.8h, v8.8h
+
+ bdof_vx_vy_8x_padding_left_right v30, v31, v10, v30
+ bdof_vx_vy_8x_padding_left_right v9, v31, v10, v31
+ bdof_vx_vy_8x_padding_left_right v8, v9, v10, v8
+
+ bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+
+ cmp w12, w6
+ b.ne 10f
+ cbnz x15, 10f
+
+ // y == block_h && dy == 0, duplicate first line results
+ bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+ add x15, x15, #0x1
+ b 9b
+10:
+ cmp x15, #(BDOF_MIN_BLOCK_SIZE - 1)
+ b.eq 11f
+ cmp x15, #(BDOF_MIN_BLOCK_SIZE)
+ b.ne 12f
+ b 1b
+11:
+ // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
+ // duplicate the results and break
+ cmp x12, #(BDOF_MIN_BLOCK_SIZE)
+ b.eq 13f
+ bdof_vx_vy_8x_save_line v18, v19, v20, v21, v22
+12:
+ add x15, x15, #1
+ b 8b
+13:
+ // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
+ // padding bottom then break
+ bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+ b 2b
+16:
+ ldp d9, d8, [sp, #0x10]
+ ldp d11, d10, [sp], #0x20
ret
+endfunc
-.unreq src0
-.unreq src1
-.unreq pad_mask
-.unreq gh
-.unreq gv
-.unreq vx
-.unreq vy
-.unreq sgx2
-.unreq sgy2
-.unreq sgxgy
-.unreq sgxdi
-.unreq sgydi
-.unreq sgx2_v
-.unreq sgy2_v
-.unreq sgxgy_v
-.unreq sgxdi_v
-.unreq sgydi_v
-.unreq sgx2_v2
-.unreq sgy2_v2
-.unreq sgxgy_v2
-.unreq sgxdi_v2
-.unreq sgydi_v2
-.unreq y
+/*
+ * x0: const int16_t *_src0,
+ * x1: const int16_t *_src1,
+ * x2: int16_t *gradient_h[2],
+ * x3: int16_t *gradient_v[2],
+ * x4: int16_t vx[16],
+ * x5: int16_t vy[16],
+ * w6: int block_h
+ */
+function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
+ sub sp, sp, #0x80
+ stp d15, d14, [sp, #0x30]
+ stp d13, d12, [sp, #0x40]
+ stp d11, d10, [sp, #0x50]
+ stp d9, d8, [sp, #0x60]
+ stp x29, x30, [sp, #0x70]
+
+ ldp x8, x9, [x2] // gh0, gh1
+ ldp x10, x11, [x3] // gv0, gv1
+ movrel x12, bdof_vx_vy_16x_tbl
+ ldp q0, q1, [x12] // table
+ mov w13, w6 // y = block_h
+ b 4f
+
+1:
+ // save line4
+ bdof_vx_vy_16x_save_line v6, v7, v16, v17, v18
+2:
+ clz v3.4s, v25.4s
+ mvni v5.4s, #0x1e
+ add v3.4s, v3.4s, v5.4s // -log2()
+ shl v4.4s, v28.4s, #0x2
+ sshl v3.4s, v4.4s, v3.4s
+
+ movi v28.4s, #0xf // clip to 15
+ mvni v29.4s, #0xe // clip to -15
+ bdof_vx_vy_clip_mask v3, v28, v29, v25, v3
+ sqxtn v4.4h, v3.4s
+ st1 {v4.d}[0], [x4], #(BDOF_MIN_BLOCK_SIZE * 2)
+
+ subs x13, x13, #(BDOF_MIN_BLOCK_SIZE) // y -= BDOF_MIN_BLOCK_SIZE
+
+ clz v4.4s, v24.4s
+ add v4.4s, v4.4s, v5.4s // -log2()
+ shl v5.4s, v27.4s, #0x2
+ mul v3.4s, v3.4s, v26.4s
+ sshr v3.4s, v3.4s, #0x1
+ sub v3.4s, v5.4s, v3.4s
+ sshl v3.4s, v3.4s, v4.4s
+
+ bdof_vx_vy_clip_mask v3, v28, v29, v24, v3
+ sqxtn v3.4h, v3.4s
+ st1 {v3.d}[0], [x5], #(BDOF_MIN_BLOCK_SIZE * 2)
+ b.eq 16f
+4:
+ mov w14, #0x0 // dy, inner loop
+
+ movi v25.2d, #0
+ movi v24.2d, #0
+ movi v26.2d, #0
+ movi v28.2d, #0
+ movi v27.2d, #0
+ b 8f
+
+5:
+ // add line(-1) and line0 from previous results
+ bdof_vx_vy_16x_add_line v19, v20, v21, v22, v23
+ bdof_vx_vy_16x_add_line v6, v7, v16, v17, v18
+ add w14, w14, #0x1
+
+8:
+ cmp w13, w6
+ b.hs 9f
+ // y < block_h && dy == 0, reuse previous results
+ cbz w14, 5b
+9:
+ ld1 {v29.8h, v30.8h}, [x0] // src0
+ sshr v31.8h, v29.8h, #0x4
+ ld1 {v8.8h, v9.8h}, [x1] // src1
+ sshr v10.8h, v8.8h, #0x4
+ ld1 {v11.8h, v12.8h}, [x8], #32 // gh0
+ sshr v29.8h, v30.8h, #0x4
+ sshr v30.8h, v9.8h, #0x4
+ ld1 {v8.8h, v9.8h}, [x9], #32 // gh1
+ shadd v13.8h, v11.8h, v8.8h // (gh0 + gh1) >> 1, left half
+ ld1 {v14.8h, v15.8h}, [x10], #32 // gv0
+ ld1 {v3.8h, v4.8h}, [x11], #32 // gv1
+ shadd v5.8h, v14.8h, v3.8h // (gv0 + gv1) >> 1, left half
+ sub v31.8h, v31.8h, v10.8h // diff, left half
+ shadd v8.8h, v12.8h, v9.8h // (gh0 + gh1) >> 1, right half
+ shadd v3.8h, v15.8h, v4.8h // (gv0 + gv1) >> 1, right half
+ sub v4.8h, v29.8h, v30.8h // diff, right half
+
+ abs v29.8h, v13.8h
+ abs v30.8h, v8.8h
+ abs v9.8h, v5.8h
+ abs v10.8h, v3.8h
+
+ add x0, x0, #(VVC_MAX_PB_SIZE * 2)
+ add x1, x1, #(VVC_MAX_PB_SIZE * 2)
+
+ bdof_vx_vy_16x_padding_left_right v29, v30, v11, v12, v14, v29
+ bdof_vx_vy_16x_padding_left_right v9, v10, v11, v12, v14, v30
+
+ bdof_vx_vy_sign v13, v9, v10, v9
+ bdof_vx_vy_sign v8, v10, v11, v10
+ bdof_vx_vy_sign v5, v11, v5, v5
+ bdof_vx_vy_sign v3, v11, v3, v3
+
+ mul v11.8h, v5.8h, v13.8h
+ mul v12.8h, v3.8h, v8.8h
+ mul v8.8h, v9.8h, v31.8h
+ mul v9.8h, v10.8h, v4.8h
+ mul v13.8h, v5.8h, v31.8h
+ mul v14.8h, v3.8h, v4.8h
+
+ bdof_vx_vy_16x_padding_left_right v11, v12, v3, v4, v5, v31
+ bdof_vx_vy_16x_padding_left_right v8, v9, v3, v4, v5, v8
+ bdof_vx_vy_16x_padding_left_right v13, v14, v3, v4, v5, v9
+
+ bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+ // check whether padding top
+ cmp w13, w6
+ b.ne 10f
+ cbnz w14, 10f
+ // y == block_h && dy == 0, padding top
+ bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+ add w14, w14, #0x1
+ b 9b
+10:
+ cmp w14, #(BDOF_MIN_BLOCK_SIZE - 1)
+ b.eq 11f
+ cmp w14, #(BDOF_MIN_BLOCK_SIZE)
+ b.ne 12f
+ // save line4
+ b 1b
+11:
+ // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1, padding bottom
+ cmp x13, #(BDOF_MIN_BLOCK_SIZE)
+ b.eq 13f
+ // save line3
+ bdof_vx_vy_16x_save_line v19, v20, v21, v22, v23
+12:
+ add w14, w14, #0x1 // dy++
+ b 8b
+13:
+ // padding bottom
+ bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+ b 2b
+16:
+ // restore
+ ldp x29, x30, [sp, #0x70]
+ ldp d9, d8, [sp, #0x60]
+ ldp d11, d10, [sp, #0x50]
+ ldp d13, d12, [sp, #0x40]
+ ldp d15, d14, [sp, #0x30]
+ add sp, sp, #0x80
+ ret
endfunc
diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
index ac6182b09d..d8ddaacb14 100644
--- a/libavcodec/aarch64/vvc/of_template.c
+++ b/libavcodec/aarch64/vvc/of_template.c
@@ -41,6 +41,11 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
BDOF_BLOCK_SIZE,
_src1, MAX_PB_SIZE, block_w, block_h);
+ int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
+ if (block_w == 8)
+ ff_vvc_derive_bdof_vx_vy_8x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
+ else
+ ff_vvc_derive_bdof_vx_vy_16x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
@@ -50,14 +55,10 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
int idx = BDOF_BLOCK_SIZE * y + x;
const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
- int16_t vx[2], vy[2];
- int pad_mask = !x | ((!y) << 1) |
- ((x + 2 * BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
- ((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
- ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, vx, vy);
+ int idx1 = y + x / BDOF_MIN_BLOCK_SIZE;
FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
- src0, src1, gh, gv,
- vx, vy);
+ src0, src1, gh, gv,
+ vx + idx1, vy + idx1);
}
dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
}
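
Since the refinement is now derived for the whole column of 4x4
sub-blocks up front, vx[]/vy[] hold one value per 4x4 sub-block, four
entries per row of sub-blocks (per the store strides in the assembly,
the 8x path writes two values per row but still advances by four). The
idx1 expression is then plain row-major addressing; a small sanity
check, assuming BDOF_MIN_BLOCK_SIZE == 4 as in the existing headers:

    #include <assert.h>

    #define BDOF_MIN_BLOCK_SIZE 4

    /* y is a pixel row (multiple of 4) and x a pixel column (stepped by
     * 2 * BDOF_MIN_BLOCK_SIZE in the caller), so y + x / 4 equals the
     * row-major index into a 4-entries-per-row layout. */
    static void check_idx1(void)
    {
        for (int y = 0; y < 16; y += BDOF_MIN_BLOCK_SIZE)
            for (int x = 0; x < 16; x += 2 * BDOF_MIN_BLOCK_SIZE)
                assert(y + x / BDOF_MIN_BLOCK_SIZE == (y / 4) * 4 + x / 4);
    }
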
--
2.46.0