Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] vvc-bdof-rework-2 (PR #20241)
@ 2025-08-14 16:35 Zhao Zhili
  0 siblings, 0 replies; only message in thread
From: Zhao Zhili @ 2025-08-14 16:35 UTC (permalink / raw)
  To: ffmpeg-devel

PR #20241 opened by Zhao Zhili (quink)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20241
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20241.patch


From c3362c98ce019e218463a62fbee770b16c3bb478 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <zhilizhao@tencent.com>
Date: Fri, 20 Jun 2025 21:15:20 +0800
Subject: [PATCH 1/2] avcodec/aarch64/vvc: Optimize derive_bdof_vx_vy

                               Before             After
-----------------------------------------------------------------
apply_bdof_8_8x16_c:       |   7375.5 ( 1.00x) |  7473.8 ( 1.00x)
apply_bdof_8_8x16_neon:    |   1875.1 ( 3.93x) |  1135.8 ( 6.58x)
apply_bdof_8_16x8_c:       |   7273.9 ( 1.00x) |  7204.0 ( 1.00x)
apply_bdof_8_16x8_neon:    |   1738.2 ( 4.18x) |  1013.0 ( 7.11x)
apply_bdof_8_16x16_c:      |  14744.9 ( 1.00x) | 14712.6 ( 1.00x)
apply_bdof_8_16x16_neon:   |   3446.7 ( 4.28x) |  1997.7 ( 7.36x)
apply_bdof_10_8x16_c:      |   7352.4 ( 1.00x) |  7485.7 ( 1.00x)
apply_bdof_10_8x16_neon:   |   1861.0 ( 3.95x) |  1134.1 ( 6.60x)
apply_bdof_10_16x8_c:      |   7330.5 ( 1.00x) |  7232.8 ( 1.00x)
apply_bdof_10_16x8_neon:   |   1747.2 ( 4.20x) |  1002.6 ( 7.21x)
apply_bdof_10_16x16_c:     |  14522.4 ( 1.00x) | 14664.8 ( 1.00x)
apply_bdof_10_16x16_neon:  |   3490.5 ( 4.16x) |  1978.4 ( 7.41x)
apply_bdof_12_8x16_c:      |   7389.0 ( 1.00x) |  7380.1 ( 1.00x)
apply_bdof_12_8x16_neon:   |   1861.3 ( 3.97x) |  1134.0 ( 6.51x)
apply_bdof_12_16x8_c:      |   7283.1 ( 1.00x) |  7336.9 ( 1.00x)
apply_bdof_12_16x8_neon:   |   1749.1 ( 4.16x) |  1002.3 ( 7.32x)
apply_bdof_12_16x16_c:     |  14580.7 ( 1.00x) | 14502.7 ( 1.00x)
apply_bdof_12_16x16_neon:  |   3472.9 ( 4.20x) |  1978.3 ( 7.33x)

Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
---
 libavcodec/aarch64/vvc/dsp_init.c    |  17 +-
 libavcodec/aarch64/vvc/inter.S       | 606 ++++++++++++++++-----------
 libavcodec/aarch64/vvc/of_template.c |  15 +-
 3 files changed, 386 insertions(+), 252 deletions(-)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 9a171234f6..1db38ebb1d 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -37,11 +37,18 @@ void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
                                      ptrdiff_t src_stride,
                                      int width, int height);
 
-void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0, const int16_t *_src1,
-                                   int pad_mask,
-                                   const int16_t **gradient_h,
-                                   const int16_t **gradient_v,
-                                   int16_t *vx, int16_t *vy);
+void ff_vvc_derive_bdof_vx_vy_8x_neon(const int16_t *_src0,
+                                      const int16_t *_src1,
+                                      int16_t *const gradient_h[2],
+                                      int16_t *const gradient_v[2],
+                                      int16_t vx[16], int16_t vy[16],
+                                      int block_h);
+void ff_vvc_derive_bdof_vx_vy_16x_neon(const int16_t *_src0,
+                                       const int16_t *_src1,
+                                       int16_t *const gradient_h[2],
+                                       int16_t *const gradient_v[2],
+                                       int16_t vx[16], int16_t vy[16],
+                                       int block_h);
 #define BIT_DEPTH 8
 #include "alf_template.c"
 #include "of_template.c"
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index c299e6f68b..06c6f3619b 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -804,262 +804,388 @@ function ff_vvc_apply_bdof_block_12_neon, export=1
         vvc_apply_bdof_block 12
 endfunc
 
-function ff_vvc_derive_bdof_vx_vy_neon, export=1
-        src0            .req x0
-        src1            .req x1
-        pad_mask        .req w2
-        gh              .req x3
-        gv              .req x4
-        vx              .req x5
-        vy              .req x6
+const bdof_vx_vy_8x_tbl
+        .byte 0, 1, 16, 16, 16, 16, 8, 9
+        .byte 6, 7, 16, 16, 16, 16, 14, 15
+endconst
 
-        gh0             .req x7
-        gh1             .req x8
-        gv0             .req x9
-        gv1             .req x10
-        y               .req x12
+const bdof_vx_vy_16x_tbl
+        .byte 0,  1,  64, 64, 64, 64, 8,  9
+        .byte 6,  7,  64, 64, 64, 64, 16, 17
+        .byte 14, 15, 64, 64, 64, 64, 24, 25
+        .byte 22, 23, 64, 64, 64, 64, 30, 31
+endconst
 
-        sgx2            .req w7
-        sgy2            .req w8
-        sgxgy           .req w9
-        sgxdi           .req w10
-        sgydi           .req w11
+// line(-1), line0, line1, line2, line3, line4
+// line3 and line4 become line(-1) and line0 in the next block.
+.macro bdof_vx_vy_8x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
+        mov             \tmp0\().16b, v28.16b
+        mov             \tmp1\().16b, v29.16b
+        mov             \tmp2\().16b, v30.16b
+        mov             \tmp3\().16b, v31.16b
+        mov             \tmp4\().16b, v8.16b
+.endm
 
-        sgx2_v          .req v22
-        sgy2_v          .req v23
-        sgxgy_v         .req v24
-        sgxdi_v         .req v25
-        sgydi_v         .req v26
+.macro bdof_vx_vy_8x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
+        add             v25.4s, v25.4s, \tmp0\().4s
+        add             v27.4s, v27.4s, \tmp1\().4s
+        add             v23.4s, v23.4s, \tmp2\().4s
+        sub             v26.4s, v26.4s, \tmp3\().4s
+        sub             v24.4s, v24.4s, \tmp4\().4s
+.endm
 
-        sgx2_v2         .req v27
-        sgy2_v2         .req v28
-        sgxgy_v2        .req v29
-        sgxdi_v2        .req v30
-        sgydi_v2        .req v31
+.macro bdof_vx_vy_8x_padding_left_right src, tmp0, tmp1, dst
+        tbl             \tmp0\().16b, { \src\().16b }, v0.16b
+        saddl           \tmp1\().4s, \tmp0\().4h, \src\().4h
+        saddl2          \dst\().4s, \tmp0\().8h, \src\().8h
+        addp            \dst\().4s, \tmp1\().4s, \dst\().4s
+.endm
+
+.macro bdof_vx_vy_sign src, tmp0, tmp1, dst
+        cmlt            \tmp0\().8h, \src\().8h, #0
+        cmgt            \tmp1\().8h, \src\().8h, #0
+        sub             \dst\().8h, \tmp0\().8h, \tmp1\().8h
+.endm
+
+.macro bdof_vx_vy_clip_mask src, max, min, mask, dst
+        smin            \src\().4s, \src\().4s, \max\().4s
+        smax            \src\().4s, \src\().4s, \min\().4s
+        cmgt            \mask\().4s, \mask\().4s, #0
+        and             \dst\().16b, \src\().16b, \mask\().16b
+.endm
+
+.macro bdof_vx_vy_16x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
+        mov             \tmp0\().16b, v29.16b
+        mov             \tmp1\().16b, v30.16b
+        mov             \tmp2\().16b, v31.16b
+        mov             \tmp3\().16b, v8.16b
+        mov             \tmp4\().16b, v9.16b
+.endm
+
+.macro bdof_vx_vy_16x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
+        add             v25.4s, v25.4s, \tmp0\().4s
+        add             v24.4s, v24.4s, \tmp1\().4s
+        add             v26.4s, v26.4s, \tmp2\().4s
+        sub             v28.4s, v28.4s, \tmp3\().4s
+        sub             v27.4s, v27.4s, \tmp4\().4s
+.endm
+
+.macro bdof_vx_vy_16x_padding_left_right src0, src1, tmp0, tmp1, tmp2, dst
+        tbl             \tmp0\().16b, {\src0\().16b, \src1\().16b}, v0.16b
+        tbl             v2.16b, {\src0\().16b, \src1\().16b}, v1.16b
+        saddl           \tmp1\().4s, \tmp0\().4h, \src0\().4h
+        saddl           \tmp2\().4s, v2.4h, \src1\().4h
+        saddl2          \tmp0\().4s, \tmp0\().8h, \src0\().8h
+        saddl2          \dst\().4s, v2.8h, \src1\().8h
+        addp            \tmp0\().4s, \tmp1\().4s, \tmp0\().4s
+        addp            \dst\().4s, \tmp2\().4s, \dst\().4s
+        addp            \dst\().4s, \tmp0\().4s, \dst\().4s
+.endm
+
+/*
+ * x0: const int16_t *_src0,
+ * x1: const int16_t *_src1,
+ * x2: int16_t *gradient_h[2],
+ * x3: int16_t *gradient_v[2],
+ * x4: int16_t vx[16],
+ * x5: int16_t vy[16],
+ * w6: int block_h
+ */
+function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
+        stp             d11, d10, [sp, #-0x20]!
+        stp             d9, d8, [sp, #0x10]
+
+        ldp             x14, x13, [x2]                      // gh0, gh1
+        ldp             x10, x9, [x3]                       // gv0, gv1
+        movrel          x11, bdof_vx_vy_8x_tbl
+        ldr             q0, [x11]                           // table
+        mvni            v2.4s, #30                          // -31, for log2
+        movi            v3.4s, #15                          // clip to 15
+        mvni            v4.4s, #14                          // clip to -15
+
+        mov             w11, #0x8
+        mov             w12, w6                             // y = block_h
+        b               4f
 
-        ldp             gh0, gh1, [gh]
-        ldp             gv0, gv1, [gv]
-        movi            sgx2_v.4s, #0
-        movi            sgy2_v.4s, #0
-        movi            sgxgy_v.4s, #0
-        movi            sgxdi_v.4s, #0
-        movi            sgydi_v.4s, #0
-        movi            sgx2_v2.4s, #0
-        movi            sgy2_v2.4s, #0
-        movi            sgxgy_v2.4s, #0
-        movi            sgxdi_v2.4s, #0
-        movi            sgydi_v2.4s, #0
-        mov             x13, #-1                    // dy
-        movi            v6.4s, #0
-        mov             y, #-1
-        tbz             pad_mask, #1, 1f            // check pad top
-        mov             x13, #0                     // dy: pad top
 1:
-        mov             x16, #-2                    // dx
-        add             x14, src0, x13, lsl #8      // local src0
-        add             x15, src1, x13, lsl #8      // local src1
-        add             x17, x16, x13, lsl #5
-        ldr             q0, [x14, x16]
-        ldr             q1, [x15, x16]
-        ldr             q2, [gh0, x17]
-        ldr             q3, [gh1, x17]
-        ldr             q4, [gv0, x17]
-        ldr             q5, [gv1, x17]
-        add             x16, x16, #8
-        add             x17, x17, #8
-        ins             v0.s[3], v6.s[3]
-        ins             v1.s[3], v6.s[3]
-        ins             v2.s[3], v6.s[3]
-        ins             v3.s[3], v6.s[3]
-        ins             v4.s[3], v6.s[3]
-        ins             v5.s[3], v6.s[3]
+        // save line4 results
+        bdof_vx_vy_8x_save_line v5, v6, v7, v16, v17
+2:
+        addp            v25.4s, v25.4s, v25.4s
+        addp            v27.4s, v27.4s, v27.4s
+        addp            v26.4s, v26.4s, v26.4s
+        addp            v23.4s, v23.4s, v23.4s
+        addp            v24.4s, v24.4s, v24.4s
 
-        ldr             q16, [x14, x16]
-        ldr             q17, [x15, x16]
-        ldr             q18, [gh0, x17]
-        ldr             q19, [gh1, x17]
-        ldr             q20, [gv0, x17]
-        ldr             q21, [gv1, x17]
-        ins             v16.s[3], v6.s[3]
-        ins             v17.s[3], v6.s[3]
-        ins             v18.s[3], v6.s[3]
-        ins             v19.s[3], v6.s[3]
-        ins             v20.s[3], v6.s[3]
-        ins             v21.s[3], v6.s[3]
+        clz             v28.4s, v25.4s
+        add             v28.4s, v28.4s, v2.4s               // log2
+        shl             v26.4s, v26.4s, #0x2
+        sshl            v26.4s, v26.4s, v28.4s
 
-        tbz             pad_mask, #0, 20f
-        // pad left
-        ins             v0.h[0], v0.h[1]
-        ins             v1.h[0], v1.h[1]
-        ins             v2.h[0], v2.h[1]
-        ins             v3.h[0], v3.h[1]
-        ins             v4.h[0], v4.h[1]
-        ins             v5.h[0], v5.h[1]
-20:
-        tbz             pad_mask, #2, 21f
-        // pad right
-        ins             v16.h[5], v16.h[4]
-        ins             v17.h[5], v17.h[4]
-        ins             v18.h[5], v18.h[4]
-        ins             v19.h[5], v19.h[4]
-        ins             v20.h[5], v20.h[4]
-        ins             v21.h[5], v21.h[4]
-21:
-        sshr            v0.8h, v0.8h, #4
-        sshr            v1.8h, v1.8h, #4
-        add             v2.8h, v2.8h, v3.8h
-        add             v4.8h, v4.8h, v5.8h
-        sub             v0.8h, v0.8h, v1.8h         // diff
-        sshr            v2.8h, v2.8h, #1            // temph
-        sshr            v4.8h, v4.8h, #1            // tempv
+        bdof_vx_vy_clip_mask v26, v3, v4, v25, v25
+        sqxtn           v26.4h, v25.4s
+        st1             {v26.s}[0], [x4], x11
 
-        sshr            v16.8h, v16.8h, #4
-        sshr            v17.8h, v17.8h, #4
-        add             v18.8h, v18.8h, v19.8h
-        add             v20.8h, v20.8h, v21.8h
-        sub             v16.8h, v16.8h, v17.8h      // diff
-        sshr            v18.8h, v18.8h, #1          // temph
-        sshr            v20.8h, v20.8h, #1          // tempv
+        subs            x12, x12, #(BDOF_MIN_BLOCK_SIZE)
 
-        abs             v3.8h, v2.8h
-        abs             v5.8h, v4.8h
-        uxtl            v19.4s, v3.4h
-        uxtl            v21.4s, v5.4h
-        uxtl2           v3.4s, v3.8h
-        uxtl2           v5.4s, v5.8h
-        add             v3.4s, v3.4s, v19.4s
-        add             v5.4s, v5.4s, v21.4s
-        add             sgx2_v.4s, sgx2_v.4s, v3.4s
-        add             sgy2_v.4s, sgy2_v.4s, v5.4s
+        clz             v26.4s, v27.4s
+        add             v26.4s, v26.4s, v2.4s
+        shl             v24.4s, v24.4s, #0x2
+        mul             v23.4s, v25.4s, v23.4s
+        sshr            v23.4s, v23.4s, #0x1
+        sub             v23.4s, v24.4s, v23.4s
+        sshl            v23.4s, v23.4s, v26.4s
 
-        abs             v3.8h, v18.8h
-        abs             v5.8h, v20.8h
-        uxtl            v19.4s, v3.4h
-        uxtl            v21.4s, v5.4h
-        uxtl2           v3.4s, v3.8h
-        uxtl2           v5.4s, v5.8h
-        add             v3.4s, v3.4s, v19.4s
-        add             v5.4s, v5.4s, v21.4s
-        add             sgx2_v2.4s, sgx2_v2.4s, v3.4s
-        add             sgy2_v2.4s, sgy2_v2.4s, v5.4s
+        bdof_vx_vy_clip_mask v23, v3, v4, v27, v23
+        sqxtn           v23.4h, v23.4s
+        st1             {v23.s}[0], [x5], x11
 
-        cmgt            v17.8h, v4.8h, #0
-        cmlt            v7.8h, v4.8h, #0
-        cmgt            v19.8h, v20.8h, #0
-        cmlt            v21.8h, v20.8h, #0
-        sub             v17.8h, v7.8h, v17.8h       // VVC_SIGN(tempv)
-        sub             v19.8h, v21.8h, v19.8h      // VVC_SIGN(tempv)
-
-        smlal           sgxgy_v.4s, v17.4h, v2.4h
-        smlal2          sgxgy_v.4s, v17.8h, v2.8h
-        smlsl           sgydi_v.4s, v17.4h, v0.4h
-        smlsl2          sgydi_v.4s, v17.8h, v0.8h
-
-        cmgt            v3.8h, v2.8h, #0
-        cmlt            v5.8h, v2.8h, #0
-        cmgt            v17.8h, v18.8h, #0
-        cmlt            v21.8h, v18.8h, #0
-        sub             v3.8h, v5.8h, v3.8h         // VVC_SIGN(temph)
-        sub             v17.8h, v21.8h, v17.8h      // VVC_SIGN(temph)
-
-        smlal           sgxgy_v2.4s, v19.4h, v18.4h
-        smlal2          sgxgy_v2.4s, v19.8h, v18.8h
-        smlsl           sgydi_v2.4s, v19.4h, v16.4h
-        smlsl2          sgydi_v2.4s, v19.8h, v16.8h
-
-        smlsl           sgxdi_v.4s, v3.4h, v0.4h
-        smlsl2          sgxdi_v.4s, v3.8h, v0.8h
-        smlsl           sgxdi_v2.4s, v17.4h, v16.4h
-        smlsl2          sgxdi_v2.4s, v17.8h, v16.8h
-3:
-        add             y, y, #1
-        cmp             y, #(BDOF_MIN_BLOCK_SIZE)
-        mov             x13, y
-        b.gt            4f
-        b.lt            1b
-        tbz             pad_mask, #3, 1b
-        sub             x13, x13, #1                // pad bottom
-        b               1b
+        b.eq            16f
 4:
-        addv            s22, sgx2_v.4s
-        addv            s23, sgy2_v.4s
-        addv            s24, sgxgy_v.4s
-        addv            s25, sgxdi_v.4s
-        addv            s26, sgydi_v.4s
+        mov             x15, #0x0                           // dy, inner loop
 
-        mov             w3, #31
-        mov             w16, #-15
-        mov             w17, #15
-40:
-        mov             w14, #0
+        movi            v25.2d, #0
+        movi            v27.2d, #0
+        movi            v23.2d, #0
+        movi            v26.2d, #0
+        movi            v24.2d, #0
+        b               8f
 
-        mov             sgx2, v22.s[0]
-        mov             sgy2, v23.s[0]
-        mov             sgxgy, v24.s[0]
-        mov             sgxdi, v25.s[0]
-        mov             sgydi, v26.s[0]
-
-        cbz             sgx2, 5f
-        clz             w12, sgx2
-        lsl             sgxdi, sgxdi, #2
-        sub             w13, w3, w12                // log2(sgx2)
-        asr             sgxdi, sgxdi, w13
-        cmp             sgxdi, w16
-        csel            w14, w16, sgxdi, lt         // clip to -15
-        b.le            5f
-        cmp             sgxdi, w17
-        csel            w14, w17, sgxdi, gt         // clip to 15
 5:
-        strh            w14, [vx], #2
+        // add line(-1) and line0 from previous results
+        bdof_vx_vy_8x_add_line v18, v19, v20, v21, v22
+        bdof_vx_vy_8x_add_line v5, v6, v7, v16, v17
+        add             x15, x15, #1
+8:
+        cmp             w12, w6
+        b.hs            9f
+        // y < block_h && dy == 0, reuse previous results
+        cbz             x15, 5b
+9:
+        ldr             q28, [x0]                                   // src0
+        ldr             q29, [x1]                                   // src1
+        ldr             q30, [x14], #(BDOF_BLOCK_SIZE * 2)          // gh0
+        ldr             q31, [x13], #(BDOF_BLOCK_SIZE * 2)          // gh1
+        ldr             q8, [x10], #(BDOF_BLOCK_SIZE * 2)           // gv0
+        ldr             q9, [x9], #(BDOF_BLOCK_SIZE * 2)            // gv1
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2)
+        add             x1, x1, #(VVC_MAX_PB_SIZE * 2)
 
-        mov             w15, #0
-        cbz             sgy2, 6f
-        lsl             sgydi, sgydi, #2
-        smull           x14, w14, sgxgy
-        asr             w14, w14, #1
-        sub             sgydi, sgydi, w14
-        clz             w12, sgy2
-        sub             w13, w3, w12                // log2(sgy2)
-        asr             sgydi, sgydi, w13
-        cmp             sgydi, w16
-        csel            w15, w16, sgydi, lt         // clip to -15
-        b.le            6f
-        cmp             sgydi, w17
-        csel            w15, w17, sgydi, gt         // clip to 15
-6:
-        strh            w15, [vy], #2
-        cbz             x0, 7f
-        addv            s22, sgx2_v2.4s
-        addv            s23, sgy2_v2.4s
-        addv            s24, sgxgy_v2.4s
-        addv            s25, sgxdi_v2.4s
-        addv            s26, sgydi_v2.4s
-        mov             x0, #0
-        b               40b
-7:
+        sshr            v28.8h, v28.8h, #0x4
+        sshr            v29.8h, v29.8h, #0x4
+        shadd           v30.8h, v30.8h, v31.8h                      // tmph
+        shadd           v31.8h, v8.8h, v9.8h                        // tmpv
+        sub             v8.8h, v28.8h, v29.8h                       // diff
+
+        abs             v28.8h, v30.8h
+        abs             v29.8h, v31.8h
+
+        bdof_vx_vy_8x_padding_left_right v28, v9, v10, v28
+        bdof_vx_vy_8x_padding_left_right v29, v9, v10, v29
+
+        bdof_vx_vy_sign v30, v9, v10, v9
+        bdof_vx_vy_sign v31, v10, v31, v31
+
+        mul             v30.8h, v31.8h, v30.8h
+        mul             v9.8h, v9.8h, v8.8h
+        mul             v8.8h, v31.8h, v8.8h
+
+        bdof_vx_vy_8x_padding_left_right v30, v31, v10, v30
+        bdof_vx_vy_8x_padding_left_right v9, v31, v10, v31
+        bdof_vx_vy_8x_padding_left_right v8, v9, v10, v8
+
+        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+
+        cmp             w12, w6
+        b.ne            10f
+        cbnz            x15, 10f
+
+        // y == block_h && dy == 0, duplicate first line results
+        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+        add             x15, x15, #0x1
+        b               9b
+10:
+        cmp             x15, #(BDOF_MIN_BLOCK_SIZE - 1)
+        b.eq            11f
+        cmp             x15, #(BDOF_MIN_BLOCK_SIZE)
+        b.ne            12f
+        b               1b
+11:
+        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
+        // duplicate the results and break
+        cmp             x12, #(BDOF_MIN_BLOCK_SIZE)
+        b.eq            13f
+        bdof_vx_vy_8x_save_line v18, v19, v20, v21, v22
+12:
+        add             x15, x15, #1
+        b               8b
+13:
+        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
+        // padding bottom then break
+        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+        b               2b
+16:
+        ldp             d9, d8, [sp, #0x10]
+        ldp             d11, d10, [sp], #0x20
+        ret
+endfunc
+
+/*
+ * x0: const int16_t *_src0,
+ * x1: const int16_t *_src1,
+ * x2: int16_t *gradient_h[2],
+ * x3: int16_t *gradient_v[2],
+ * x4: int16_t vx[16],
+ * x5: int16_t vy[16],
+ * w6: int block_h
+ */
+function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
+        sub             sp, sp, #0x80
+        stp             d15, d14, [sp, #0x30]
+        stp             d13, d12, [sp, #0x40]
+        stp             d11, d10, [sp, #0x50]
+        stp             d9, d8,   [sp, #0x60]
+        stp             x29, x30, [sp, #0x70]
+
+        ldp             x8, x9, [x2]                        // gh0, gh1
+        ldp             x10, x11, [x3]                      // gv0, gv1
+        movrel          x12, bdof_vx_vy_16x_tbl
+        ldp             q0, q1, [x12]                       // table
+        mov             w13, w6                             // y = block_h
+        b               4f
+
+1:
+        // save line4
+        bdof_vx_vy_16x_save_line v6, v7, v16, v17, v18
+2:
+        clz             v3.4s, v25.4s
+        mvni            v5.4s, #0x1e
+        add             v3.4s, v3.4s, v5.4s                 // -log2()
+        shl             v4.4s, v28.4s, #0x2
+        sshl            v3.4s, v4.4s, v3.4s
+
+        movi            v28.4s, #0xf                        // clip to 15
+        mvni            v29.4s, #0xe                        // clip to -15
+        bdof_vx_vy_clip_mask v3, v28, v29, v25, v3
+        sqxtn           v4.4h, v3.4s
+        st1             {v4.d}[0], [x4], #(BDOF_MIN_BLOCK_SIZE * 2)
+
+        subs            x13, x13, #(BDOF_MIN_BLOCK_SIZE)    // y -= BDOF_MIN_BLOCK_SIZE
+
+        clz             v4.4s, v24.4s
+        add             v4.4s, v4.4s, v5.4s                 // -log2()
+        shl             v5.4s, v27.4s, #0x2
+        mul             v3.4s, v3.4s, v26.4s
+        sshr            v3.4s, v3.4s, #0x1
+        sub             v3.4s, v5.4s, v3.4s
+        sshl            v3.4s, v3.4s, v4.4s
+
+        bdof_vx_vy_clip_mask v3, v28, v29, v24, v3
+        sqxtn           v3.4h, v3.4s
+        st1             {v3.d}[0], [x5], #(BDOF_MIN_BLOCK_SIZE * 2)
+        b.eq            16f
+4:
+        mov             w14, #0x0                           // dy, inner loop
+
+        movi            v25.2d, #0
+        movi            v24.2d, #0
+        movi            v26.2d, #0
+        movi            v28.2d, #0
+        movi            v27.2d, #0
+        b               8f
+
+5:
+        // add line(-1) and line0 from previous results
+        bdof_vx_vy_16x_add_line v19, v20, v21, v22, v23
+        bdof_vx_vy_16x_add_line v6, v7, v16, v17, v18
+        add             w14, w14, #0x1
+
+8:
+        cmp             w13, w6
+        b.hs            9f
+        // y < block_h && dy == 0, reuse previous results
+        cbz             w14, 5b
+9:
+        ld1             {v29.8h, v30.8h}, [x0]              // src0
+        sshr            v31.8h, v29.8h, #0x4
+        ld1             {v8.8h, v9.8h}, [x1]                // src1
+        sshr            v10.8h, v8.8h, #0x4
+        ld1             {v11.8h, v12.8h}, [x8], #32         // gh0
+        sshr            v29.8h, v30.8h, #0x4
+        sshr            v30.8h, v9.8h, #0x4
+        ld1             {v8.8h, v9.8h}, [x9], #32           // gh1
+        shadd           v13.8h, v11.8h, v8.8h               // (gh0 + gh1) >> 1, left half
+        ld1             {v14.8h, v15.8h}, [x10], #32        // gv0
+        ld1             {v3.8h, v4.8h}, [x11], #32          // gv1
+        shadd           v5.8h, v14.8h, v3.8h                // (gv0 + gv1) >> 1, left half
+        sub             v31.8h, v31.8h, v10.8h              // diff, left half
+        shadd           v8.8h, v12.8h, v9.8h                // (gh0 + gh1) >> 1, right half
+        shadd           v3.8h, v15.8h, v4.8h                // (gv0 + gv1) >> 1, right half
+        sub             v4.8h, v29.8h, v30.8h               // diff, right half
+
+        abs             v29.8h, v13.8h
+        abs             v30.8h, v8.8h
+        abs             v9.8h, v5.8h
+        abs             v10.8h, v3.8h
+
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2)
+        add             x1, x1, #(VVC_MAX_PB_SIZE * 2)
+
+        bdof_vx_vy_16x_padding_left_right v29, v30, v11, v12, v14, v29
+        bdof_vx_vy_16x_padding_left_right v9, v10, v11, v12, v14, v30
+
+        bdof_vx_vy_sign v13, v9, v10, v9
+        bdof_vx_vy_sign v8, v10, v11, v10
+        bdof_vx_vy_sign v5, v11, v5, v5
+        bdof_vx_vy_sign v3, v11, v3, v3
+
+        mul             v11.8h, v5.8h, v13.8h
+        mul             v12.8h, v3.8h, v8.8h
+        mul             v8.8h, v9.8h, v31.8h
+        mul             v9.8h, v10.8h, v4.8h
+        mul             v13.8h, v5.8h, v31.8h
+        mul             v14.8h, v3.8h, v4.8h
+
+        bdof_vx_vy_16x_padding_left_right v11, v12, v3, v4, v5, v31
+        bdof_vx_vy_16x_padding_left_right v8, v9, v3, v4, v5, v8
+        bdof_vx_vy_16x_padding_left_right v13, v14, v3, v4, v5, v9
+
+        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+        // check whether padding top
+        cmp             w13, w6
+        b.ne            10f
+        cbnz            w14, 10f
+        // y == block_h && dy == 0, padding top
+        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+        add             w14, w14, #0x1
+        b               9b
+10:
+        cmp             w14, #(BDOF_MIN_BLOCK_SIZE - 1)
+        b.eq            11f
+        cmp             w14, #(BDOF_MIN_BLOCK_SIZE)
+        b.ne            12f
+        // save line4
+        b               1b
+11:
+        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1, padding bottom
+        cmp             x13, #(BDOF_MIN_BLOCK_SIZE)
+        b.eq            13f
+        // save line3
+        bdof_vx_vy_16x_save_line v19, v20, v21, v22, v23
+12:
+        add             w14, w14, #0x1                      // dy++
+        b               8b
+13:
+        // padding bottom
+        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+        b               2b
+16:
+        // restore
+        ldp             x29, x30, [sp, #0x70]
+        ldp             d9, d8, [sp, #0x60]
+        ldp             d11, d10, [sp, #0x50]
+        ldp             d13, d12, [sp, #0x40]
+        ldp             d15, d14, [sp, #0x30]
+        add             sp, sp, #0x80
         ret
-
-.unreq src0
-.unreq src1
-.unreq pad_mask
-.unreq gh
-.unreq gv
-.unreq vx
-.unreq vy
-.unreq sgx2
-.unreq sgy2
-.unreq sgxgy
-.unreq sgxdi
-.unreq sgydi
-.unreq sgx2_v
-.unreq sgy2_v
-.unreq sgxgy_v
-.unreq sgxdi_v
-.unreq sgydi_v
-.unreq sgx2_v2
-.unreq sgy2_v2
-.unreq sgxgy_v2
-.unreq sgxdi_v2
-.unreq sgydi_v2
-.unreq y
 endfunc
diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
index ac6182b09d..d8ddaacb14 100644
--- a/libavcodec/aarch64/vvc/of_template.c
+++ b/libavcodec/aarch64/vvc/of_template.c
@@ -41,6 +41,11 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
     ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
                                     BDOF_BLOCK_SIZE,
                                     _src1, MAX_PB_SIZE, block_w, block_h);
+    int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
+    if (block_w == 8)
+        ff_vvc_derive_bdof_vx_vy_8x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
+    else
+        ff_vvc_derive_bdof_vx_vy_16x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
 
     for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
         for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
@@ -50,14 +55,10 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
             int idx = BDOF_BLOCK_SIZE * y + x;
             const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
             const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
-            int16_t vx[2], vy[2];
-            int pad_mask = !x | ((!y) << 1) |
-                           ((x + 2 * BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
-                           ((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
-            ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, vx, vy);
+            int idx1 = y + x / BDOF_MIN_BLOCK_SIZE;
             FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
-                                                             src0, src1, gh, gv,
-                                                             vx, vy);
+                                                     src0, src1, gh, gv,
+                                                     vx + idx1, vy + idx1);
         }
         dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
     }
-- 
2.49.1


From 6edb238e7ac1f98c7b16b0db05e0f5436152cc48 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <zhilizhao@tencent.com>
Date: Thu, 14 Aug 2025 12:42:38 +0800
Subject: [PATCH 2/2] avcodec/aarch64/vvc: Optimize apply_bdof

                               Before               After
--------------------------------------------------------------------
apply_bdof_8_8x16_c:       |   7431.4 ( 1.00x)   |   7371.7 ( 1.00x)
apply_bdof_8_8x16_neon:    |   1175.4 ( 6.32x)   |   1036.3 ( 7.11x)
apply_bdof_8_16x8_c:       |   7182.2 ( 1.00x)   |   7201.1 ( 1.00x)
apply_bdof_8_16x8_neon:    |   1021.7 ( 7.03x)   |    879.9 ( 8.18x)
apply_bdof_8_16x16_c:      |  14577.1 ( 1.00x)   |  14589.3 ( 1.00x)
apply_bdof_8_16x16_neon:   |   2012.8 ( 7.24x)   |   1743.3 ( 8.37x)
apply_bdof_10_8x16_c:      |   7292.4 ( 1.00x)   |   7308.5 ( 1.00x)
apply_bdof_10_8x16_neon:   |   1156.3 ( 6.31x)   |   1045.3 ( 6.99x)
apply_bdof_10_16x8_c:      |   7112.4 ( 1.00x)   |   7214.4 ( 1.00x)
apply_bdof_10_16x8_neon:   |   1007.6 ( 7.06x)   |    904.8 ( 7.97x)
apply_bdof_10_16x16_c:     |  14363.3 ( 1.00x)   |  14476.4 ( 1.00x)
apply_bdof_10_16x16_neon:  |   1986.9 ( 7.23x)   |   1783.1 ( 8.12x)
apply_bdof_12_8x16_c:      |   7433.3 ( 1.00x)   |   7374.7 ( 1.00x)
apply_bdof_12_8x16_neon:   |   1155.9 ( 6.43x)   |   1040.8 ( 7.09x)
apply_bdof_12_16x8_c:      |   7171.1 ( 1.00x)   |   7376.3 ( 1.00x)
apply_bdof_12_16x8_neon:   |   1010.8 ( 7.09x)   |    899.4 ( 8.20x)
apply_bdof_12_16x16_c:     |  14515.5 ( 1.00x)   |  14731.5 ( 1.00x)
apply_bdof_12_16x16_neon:  |   1988.4 ( 7.30x)   |   1785.2 ( 8.25x)
---
 libavcodec/aarch64/vvc/dsp_init.c    |  37 +--
 libavcodec/aarch64/vvc/inter.S       | 402 ++++++++++++++++++++++++---
 libavcodec/aarch64/vvc/of_template.c |  65 -----
 3 files changed, 368 insertions(+), 136 deletions(-)
 delete mode 100644 libavcodec/aarch64/vvc/of_template.c

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 1db38ebb1d..df0b536539 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -30,38 +30,16 @@
 #define BDOF_BLOCK_SIZE         16
 #define BDOF_MIN_BLOCK_SIZE     4
 
-void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
-                                     int16_t *gradient_v,
-                                     ptrdiff_t gradient_stride,
-                                     const int16_t *_src,
-                                     ptrdiff_t src_stride,
-                                     int width, int height);
-
-void ff_vvc_derive_bdof_vx_vy_8x_neon(const int16_t *_src0,
-                                      const int16_t *_src1,
-                                      int16_t *const gradient_h[2],
-                                      int16_t *const gradient_v[2],
-                                      int16_t vx[16], int16_t vy[16],
-                                      int block_h);
-void ff_vvc_derive_bdof_vx_vy_16x_neon(const int16_t *_src0,
-                                      const int16_t *_src1,
-                                      int16_t *const gradient_h[2],
-                                      int16_t *const gradient_v[2],
-                                      int16_t vx[16], int16_t vy[16],
-                                      int block_h);
 #define BIT_DEPTH 8
 #include "alf_template.c"
-#include "of_template.c"
 #undef BIT_DEPTH
 
 #define BIT_DEPTH 10
 #include "alf_template.c"
-#include "of_template.c"
 #undef BIT_DEPTH
 
 #define BIT_DEPTH 12
 #include "alf_template.c"
-#include "of_template.c"
 #undef BIT_DEPTH
 
 int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
@@ -121,6 +99,15 @@ DMVR_FUN(hv_, 8)
 DMVR_FUN(hv_, 10)
 DMVR_FUN(hv_, 12)
 
+#define APPLY_BDOF_FUNC(bd) \
+    void ff_vvc_apply_bdof_ ## bd ## _neon(uint8_t *_dst, ptrdiff_t _dst_stride, \
+        const int16_t *_src0, const int16_t *_src1, \
+        int block_w, int block_h);
+
+APPLY_BDOF_FUNC(8)
+APPLY_BDOF_FUNC(10)
+APPLY_BDOF_FUNC(12)
+
 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -202,7 +189,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.w_avg = vvc_w_avg_8;
         c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
-        c->inter.apply_bdof = apply_bdof_8;
+        c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon;
 
         c->sao.band_filter[0] = ff_h26x_sao_band_filter_8x8_8_neon;
         for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
@@ -246,7 +233,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.avg = ff_vvc_avg_10_neon;
         c->inter.w_avg = vvc_w_avg_10;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
-        c->inter.apply_bdof = apply_bdof_10;
+        c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
 
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
@@ -255,7 +242,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.w_avg = vvc_w_avg_12;
         c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
-        c->inter.apply_bdof = apply_bdof_12;
+        c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
 
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 06c6f3619b..61de56c6ac 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -716,7 +716,93 @@ function ff_vvc_prof_grad_filter_8x_neon, export=1
 .unreq height
 endfunc
 
-.macro vvc_apply_bdof_block bit_depth
+function ff_vvc_bdof_grad_filter_8x_neon, export=1
+        gh0             .req x0
+        gh1             .req x1
+        gv0             .req x2
+        gv1             .req x3
+        src0            .req x4
+        src1            .req x5
+        width           .req w6
+        height          .req w7
+
+1:
+        mov             x10, src0
+        mov             w11, width
+        mov             x12, gh0
+        mov             x13, gv0
+        mov             x14, src1
+        mov             x15, gh1
+        mov             x16, gv1
+2:
+        ldur            q0, [x10, #2]
+        ldur            q1, [x10, #-2]
+        ldr             q2, [x10, #(VVC_MAX_PB_SIZE << 1)]
+        ldr             q3, [x10, #-(VVC_MAX_PB_SIZE << 1)]
+        sshr            v0.8h, v0.8h, #6
+        sshr            v1.8h, v1.8h, #6
+        ldur            q4, [x14, #2]
+        ldur            q5, [x14, #-2]
+        sshr            v2.8h, v2.8h, #6
+        sshr            v3.8h, v3.8h, #6
+        ldr             q6, [x14, #(VVC_MAX_PB_SIZE << 1)]
+        ldr             q7, [x14, #-(VVC_MAX_PB_SIZE << 1)]
+        // results of gradient_h0
+        sub             v0.8h, v0.8h, v1.8h
+        // results of gradient_v0
+        sub             v2.8h, v2.8h, v3.8h
+
+        sshr            v4.8h, v4.8h, #6
+        sshr            v5.8h, v5.8h, #6
+        sshr            v6.8h, v6.8h, #6
+        sshr            v7.8h, v7.8h, #6
+        // results of gradient_h1
+        sub             v4.8h, v4.8h, v5.8h
+        // results of gradient_v1
+        sub             v6.8h, v6.8h, v7.8h
+
+        add             x10, x10, #16
+        add             x14, x14, #16
+
+        // (gradient_h0 + gradient_h1) >> 1
+        shadd           v1.8h, v0.8h, v4.8h
+        // gradient_h0 - gradient_h1
+        sub             v5.8h, v0.8h, v4.8h
+
+        subs            w11, w11, #8
+
+        // (gradient_v0 + gradient_v1) >> 1
+        shadd           v3.8h, v2.8h, v6.8h
+        // gradient_v0 - gradient_v1
+        sub             v7.8h, v2.8h, v6.8h
+
+        st1             {v1.8h}, [x12], #16
+        st1             {v5.8h}, [x15], #16
+        st1             {v3.8h}, [x13], #16
+        st1             {v7.8h}, [x16], #16
+        b.ne            2b
+
+        subs            height, height, #1
+        add             gh0, gh0, #(BDOF_BLOCK_SIZE << 1)
+        add             gv0, gv0, #(BDOF_BLOCK_SIZE << 1)
+        add             src0, src0, #(VVC_MAX_PB_SIZE << 1)
+        add             gh1, gh1, #(BDOF_BLOCK_SIZE << 1)
+        add             gv1, gv1, #(BDOF_BLOCK_SIZE << 1)
+        add             src1, src1, #(VVC_MAX_PB_SIZE << 1)
+        b.ne            1b
+        ret
+
+.unreq gh0
+.unreq gh1
+.unreq gv0
+.unreq gv1
+.unreq src0
+.unreq src1
+.unreq width
+.unreq height
+endfunc
+
+.macro vvc_apply_bdof_block_8x bit_depth
         dst             .req x0
         dst_stride      .req x1
         src0            .req x2
@@ -726,33 +812,28 @@ endfunc
         vx              .req x6
         vy              .req x7
 
-        ld1r            {v0.8h}, [vx], #2
-        ld1r            {v1.8h}, [vy], #2
-        ld1r            {v2.8h}, [vx]
-        ld1r            {v3.8h}, [vy]
-        ins             v0.d[1], v2.d[1]
-        ins             v1.d[1], v3.d[1]
-
+        ldr             w8, [sp]
         movi            v7.4s, #(1 << (14 - \bit_depth))
-        ldp             x8, x9, [gh]
-        ldp             x10, x11, [gv]
         mov             x12, #(BDOF_BLOCK_SIZE * 2)
-        mov             w13, #(BDOF_MIN_BLOCK_SIZE)
         mov             x14, #(VVC_MAX_PB_SIZE * 2)
 .if \bit_depth >= 10
         // clip pixel
         mov             w15, #((1 << \bit_depth) - 1)
         movi            v18.8h, #0
-        lsl             dst_stride, dst_stride, #1
         dup             v19.8h, w15
 .endif
+
+0:
+        ld1r            {v0.8h}, [vx], #2
+        ld1r            {v1.8h}, [vy], #2
+        ld1r            {v2.8h}, [vx]
+        ld1r            {v3.8h}, [vy]
+        mov             w13, #(BDOF_MIN_BLOCK_SIZE)
+        ins             v0.d[1], v2.d[1]
+        ins             v1.d[1], v3.d[1]
 1:
-        ld1             {v2.8h}, [x8], x12
-        ld1             {v3.8h}, [x9], x12
-        ld1             {v4.8h}, [x10], x12
-        ld1             {v5.8h}, [x11], x12
-        sub             v2.8h, v2.8h, v3.8h
-        sub             v4.8h, v4.8h, v5.8h
+        ld1             {v2.8h}, [gh], x12
+        ld1             {v4.8h}, [gv], x12
         smull           v3.4s, v0.4h, v2.4h
         smull2          v16.4s, v0.8h, v2.8h
         smlal           v3.4s, v1.4h, v4.4h
@@ -780,6 +861,11 @@ endfunc
         st1             {v5.8h}, [dst], dst_stride
 .endif
         b.ne            1b
+
+        subs            w8, w8, #(BDOF_MIN_BLOCK_SIZE)
+        add             vx, vx, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
+        add             vy, vy, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
+        b.ne            0b
         ret
 
 .unreq dst
@@ -792,16 +878,128 @@ endfunc
 .unreq vy
 .endm
 
-function ff_vvc_apply_bdof_block_8_neon, export=1
-        vvc_apply_bdof_block 8
+function ff_vvc_apply_bdof_block_8x_8_neon, export=1
+        vvc_apply_bdof_block_8x 8
 endfunc
 
-function ff_vvc_apply_bdof_block_10_neon, export=1
-        vvc_apply_bdof_block 10
+function ff_vvc_apply_bdof_block_8x_10_neon, export=1
+        vvc_apply_bdof_block_8x 10
 endfunc
 
-function ff_vvc_apply_bdof_block_12_neon, export=1
-        vvc_apply_bdof_block 12
+function ff_vvc_apply_bdof_block_8x_12_neon, export=1
+        vvc_apply_bdof_block_8x 12
+endfunc
+
+.macro vvc_apply_bdof_block_16x bit_depth
+        dst             .req x0
+        dst_stride      .req x1
+        src0            .req x2
+        src1            .req x3
+        gh              .req x4
+        gv              .req x5
+        vx              .req x6
+        vy              .req x7
+
+        ldr             w8, [sp]
+        movi            v7.4s, #(1 << (14 - \bit_depth))
+.if \bit_depth >= 10
+        // clip pixel
+        mov             w15, #((1 << \bit_depth) - 1)
+        movi            v18.8h, #0
+        dup             v19.8h, w15
+.endif
+
+0:
+        ld1r            {v0.8h}, [vx], #2
+        ld1r            {v1.8h}, [vy], #2
+        ld1r            {v2.8h}, [vx], #2
+        ld1r            {v3.8h}, [vy], #2
+
+        mov             w13, #(BDOF_MIN_BLOCK_SIZE)
+
+        ld1r            {v20.8h}, [vx], #2
+        ld1r            {v21.8h}, [vy], #2
+        ld1r            {v22.8h}, [vx], #2
+        ld1r            {v23.8h}, [vy], #2
+
+        ins             v0.d[1], v2.d[1]
+        ins             v1.d[1], v3.d[1]
+        ins             v20.d[1], v22.d[1]
+        ins             v21.d[1], v23.d[1]
+1:
+        ldp             q2, q22, [gh], #(BDOF_BLOCK_SIZE * 2)
+        ldp             q4, q24, [gv], #(BDOF_BLOCK_SIZE * 2)
+        smull           v3.4s, v0.4h, v2.4h
+        smull2          v16.4s, v0.8h, v2.8h
+        smlal           v3.4s, v1.4h, v4.4h
+        smlal2          v16.4s, v1.8h, v4.8h
+
+        ldp             q5, q25, [src0], #(VVC_MAX_PB_SIZE * 2)
+        ldp             q6, q26, [src1], #(VVC_MAX_PB_SIZE * 2)
+
+        smull           v23.4s, v20.4h, v22.4h
+        smull2          v27.4s, v20.8h, v22.8h
+        smlal           v23.4s, v21.4h, v24.4h
+        smlal2          v27.4s, v21.8h, v24.8h
+
+        saddl           v2.4s, v5.4h, v6.4h
+        add             v2.4s, v2.4s, v7.4s
+        add             v2.4s, v2.4s, v3.4s
+        saddl2          v4.4s, v5.8h, v6.8h
+        add             v4.4s, v4.4s, v7.4s
+        add             v4.4s, v4.4s, v16.4s
+
+        saddl           v22.4s, v25.4h, v26.4h
+        add             v22.4s, v22.4s, v7.4s
+        add             v22.4s, v22.4s, v23.4s
+        saddl2          v24.4s, v25.8h, v26.8h
+        add             v24.4s, v24.4s, v7.4s
+        add             v24.4s, v24.4s, v27.4s
+
+        sqshrn          v5.4h, v2.4s, #(15 - \bit_depth)
+        sqshrn2         v5.8h, v4.4s, #(15 - \bit_depth)
+        sqshrn          v25.4h, v22.4s, #(15 - \bit_depth)
+        sqshrn2         v25.8h, v24.4s, #(15 - \bit_depth)
+
+        subs            w13, w13, #1
+.if \bit_depth == 8
+        sqxtun          v5.8b, v5.8h
+        sqxtun2         v5.16b, v25.8h
+        str             q5, [dst]
+.else
+        smin            v5.8h, v5.8h, v19.8h
+        smax            v5.8h, v5.8h, v18.8h
+        smin            v25.8h, v25.8h, v19.8h
+        smax            v25.8h, v25.8h, v18.8h
+        stp             q5, q25, [dst]
+.endif
+        add             dst, dst, dst_stride
+        b.ne            1b
+
+        subs            w8, w8, #(BDOF_MIN_BLOCK_SIZE)
+        b.ne            0b
+        ret
+
+.unreq dst
+.unreq dst_stride
+.unreq src0
+.unreq src1
+.unreq gh
+.unreq gv
+.unreq vx
+.unreq vy
+.endm
+
+function ff_vvc_apply_bdof_block_16x_8_neon, export=1
+        vvc_apply_bdof_block_16x 8
+endfunc
+
+function ff_vvc_apply_bdof_block_16x_10_neon, export=1
+        vvc_apply_bdof_block_16x 10
+endfunc
+
+function ff_vvc_apply_bdof_block_16x_12_neon, export=1
+        vvc_apply_bdof_block_16x 12
 endfunc
 
 const bdof_vx_vy_8x_tbl
@@ -885,8 +1083,8 @@ endconst
 /*
  * x0: const int16_t *_src0,
  * x1: const int16_t *_src1,
- * x2: int16_t *gradient_h[2],
- * x3: int16_t *gradient_v[2],
+ * x2: const int16_t *gradient_h,
+ * x3: const int16_t *gradient_v,
  * x4: int16_t vx[16],
  * x5: int16_t vy[16],
  * w6: int block_h
@@ -895,8 +1093,6 @@ function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
         stp             d11, d10, [sp, #-0x20]!
         stp             d9, d8, [sp, #0x10]
 
-        ldp             x14, x13, [x2]                      // gh0, gh1
-        ldp             x10, x9, [x3]                       // gv0, gv1
         movrel          x11, bdof_vx_vy_8x_tbl
         ldr             q0, [x11]                           // table
         mvni            v2.4s, #30                          // -31, for log2
@@ -964,17 +1160,13 @@ function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
 9:
         ldr             q28, [x0]                                   // src0
         ldr             q29, [x1]                                   // src1
-        ldr             q30, [x14], #(BDOF_BLOCK_SIZE * 2)          // gh0
-        ldr             q31, [x13], #(BDOF_BLOCK_SIZE * 2)          // gh1
-        ldr             q8, [x10], #(BDOF_BLOCK_SIZE * 2)           // gv0
-        ldr             q9, [x9], #(BDOF_BLOCK_SIZE * 2)            // gv1
+        ldr             q30, [x2], #(BDOF_BLOCK_SIZE * 2)           // (gh0 + gh1) >> 1
+        ldr             q31, [x3], #(BDOF_BLOCK_SIZE * 2)           // (gv0 + gv1) >> 1
         add             x0, x0, #(VVC_MAX_PB_SIZE * 2)
         add             x1, x1, #(VVC_MAX_PB_SIZE * 2)
 
         sshr            v28.8h, v28.8h, #0x4
         sshr            v29.8h, v29.8h, #0x4
-        shadd           v30.8h, v30.8h, v31.8h                      // tmph
-        shadd           v31.8h, v8.8h, v9.8h                        // tmpv
         sub             v8.8h, v28.8h, v29.8h                       // diff
 
         abs             v28.8h, v30.8h
@@ -1033,8 +1225,8 @@ endfunc
 /*
  * x0: const int16_t *_src0,
  * x1: const int16_t *_src1,
- * x2: int16_t *gradient_h[2],
- * x3: int16_t *gradient_v[2],
+ * x2: const int16_t *gradient_h,
+ * x3: const int16_t *gradient_v,
  * x4: int16_t vx[16],
  * x5: int16_t vy[16],
  * w6: int block_h
@@ -1047,8 +1239,6 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
         stp             d9, d8,   [sp, #0x60]
         stp             x29, x30, [sp, #0x70]
 
-        ldp             x8, x9, [x2]                        // gh0, gh1
-        ldp             x10, x11, [x3]                      // gv0, gv1
         movrel          x12, bdof_vx_vy_16x_tbl
         ldp             q0, q1, [x12]                       // table
         mov             w13, w6                             // y = block_h
@@ -1110,17 +1300,11 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
         sshr            v31.8h, v29.8h, #0x4
         ld1             {v8.8h, v9.8h}, [x1]                // src1
         sshr            v10.8h, v8.8h, #0x4
-        ld1             {v11.8h, v12.8h}, [x8], #32         // gh0
+        ldp             q13, q8, [x2], #32                  // (gh0 + gh1) >> 1
         sshr            v29.8h, v30.8h, #0x4
         sshr            v30.8h, v9.8h, #0x4
-        ld1             {v8.8h, v9.8h}, [x9], #32           // gh1
-        shadd           v13.8h, v11.8h, v8.8h               // (gh0 + gh1) >> 1, left half
-        ld1             {v14.8h, v15.8h}, [x10], #32        // gv0
-        ld1             {v3.8h, v4.8h}, [x11], #32          // gv1
-        shadd           v5.8h, v14.8h, v3.8h                // (gv0 + gv1) >> 1, left half
+        ldp             q5, q3, [x3], #32                   // (gv0 + gv1) >> 1
         sub             v31.8h, v31.8h, v10.8h              // diff, left half
-        shadd           v8.8h, v12.8h, v9.8h                // (gh0 + gh1) >> 1, right half
-        shadd           v3.8h, v15.8h, v4.8h                // (gv0 + gv1) >> 1, right half
         sub             v4.8h, v29.8h, v30.8h               // diff, right half
 
         abs             v29.8h, v13.8h
@@ -1189,3 +1373,129 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
         add             sp, sp, #0x80
         ret
 endfunc
+
+function ff_vvc_apply_bdof_10_neon, export=1
+        mov             w6, #10
+        b               0f
+endfunc
+
+function ff_vvc_apply_bdof_12_neon, export=1
+        mov             w6, #12
+        b               0f
+endfunc
+
+// int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
+// int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
+// int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
+#define APPLY_BDOF_STACK_SIZE   ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 4)
+#define GRADIENT_H0_OFFSET      2
+#define GRADIENT_H1_OFFSET      ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 2 + 2)
+#define GRADIENT_V0_OFFSET      ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 4 + 2)
+#define GRADIENT_V1_OFFSET      ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 6 + 2)
+#define VX_OFFSET               ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8)
+#define VY_OFFSET               ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 2)
+function ff_vvc_apply_bdof_8_neon, export=1
+        mov             w6, #8
+0:
+        stp             x19, x20, [sp, #-0x40]!
+        stp             x21, x22, [sp, #0x10]
+        stp             x23, x24, [sp, #0x20]
+        stp             x25, x30, [sp, #0x30]
+
+        sub             sp, sp, #APPLY_BDOF_STACK_SIZE
+        mov             w19, w6                         // bit_depth
+        mov             x20, x0                         // dst
+        mov             x21, x1                         // dst_stride
+        mov             x22, x2                         // src0
+        mov             x23, x3                         // src1
+        mov             w24, w4                         // block_w
+        mov             w25, w5                         // block_h
+
+        // int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
+        add             x0, sp, #GRADIENT_H0_OFFSET
+        add             x1, sp, #GRADIENT_H1_OFFSET
+        add             x2, sp, #GRADIENT_V0_OFFSET
+        add             x3, sp, #GRADIENT_V1_OFFSET
+        mov             x4, x22
+        mov             x5, x23
+        mov             w6, w24
+        mov             w7, w25
+        bl              X(ff_vvc_bdof_grad_filter_8x_neon)
+
+        cmp             w24, #8
+        mov             x0, x22                         // src0
+        mov             x1, x23                         // src1
+        add             x2, sp, #GRADIENT_H0_OFFSET     // gh0
+        add             x3, sp, #GRADIENT_V0_OFFSET     // gv0
+        add             x4, sp, #VX_OFFSET              // vx
+        add             x5, sp, #VY_OFFSET              // vy
+        mov             w6, w25                         // block_h
+
+        b.gt            16f
+
+        bl              X(ff_vvc_derive_bdof_vx_vy_8x_neon)
+        cmp             w19, #10                        // check bitdepth
+        mov             x0, x20                         // dst
+        mov             x1, x21                         // dst_stride
+        mov             x2, x22                         // src0
+        mov             x3, x23                         // src1
+        add             x4, sp, #GRADIENT_H1_OFFSET     // gh1
+        add             x5, sp, #GRADIENT_V1_OFFSET     // gv1
+        add             x6, sp, #VX_OFFSET
+        add             x7, sp, #VY_OFFSET
+        str             w25, [sp]
+        b.eq            1f
+        b.gt            2f
+        // 8bit
+0:
+        bl              X(ff_vvc_apply_bdof_block_8x_8_neon)
+        b               32f
+1:
+        // 10bit
+        bl              X(ff_vvc_apply_bdof_block_8x_10_neon)
+        b               32f
+2:
+        // 12bit
+        bl              X(ff_vvc_apply_bdof_block_8x_12_neon)
+        b               32f
+16:
+        bl              X(ff_vvc_derive_bdof_vx_vy_16x_neon)
+
+        cmp             w19, #10                        // check bitdepth
+        mov             x0, x20                         // dst
+        mov             x1, x21                         // dst_stride
+        mov             x2, x22                         // src0
+        mov             x3, x23                         // src1
+        add             x4, sp, #GRADIENT_H1_OFFSET     // gh1
+        add             x5, sp, #GRADIENT_V1_OFFSET     // gv1
+        add             x6, sp, #VX_OFFSET
+        add             x7, sp, #VY_OFFSET
+        str             w25, [sp]
+        b.eq            17f
+        b.gt            18f
+        // 8bit
+        bl              X(ff_vvc_apply_bdof_block_16x_8_neon)
+        b               32f
+17:
+        // 10bit
+        bl              X(ff_vvc_apply_bdof_block_16x_10_neon)
+        b               32f
+18:
+        // 12bit
+        bl              X(ff_vvc_apply_bdof_block_16x_12_neon)
+32:
+        add             sp, sp, #APPLY_BDOF_STACK_SIZE
+        ldp             x25, x30, [sp, #0x30]
+        ldp             x23, x24, [sp, #0x20]
+        ldp             x21, x22, [sp, #0x10]
+        ldp             x19, x20, [sp], #0x40
+        ret
+endfunc
+
+#undef APPLY_BDOF_STACK_SIZE
+#undef GRADIENT_H0_OFFSET
+#undef GRADIENT_H1_OFFSET
+#undef GRADIENT_V0_OFFSET
+#undef GRADIENT_V1_OFFSET
+#undef VX_OFFSET
+#undef VY_OFFSET
diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
deleted file mode 100644
index d8ddaacb14..0000000000
--- a/libavcodec/aarch64/vvc/of_template.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/bit_depth_template.c"
-
-void FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(pixel* dst,
-        ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
-        const int16_t **gh, const int16_t **gv, int16_t *vx, int16_t *vy);
-
-static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
-                             const int16_t *_src0, const int16_t *_src1,
-                             int block_w, int block_h) {
-    // +2 for pad left and right
-    int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
-    int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
-    int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
-    int16_t *gradient_v[2] = {&gradient_buf_v[0][1], &gradient_buf_v[1][1]};
-    ptrdiff_t dst_stride = _dst_stride / sizeof(pixel);
-    pixel *dst = (pixel *) _dst;
-
-    ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0],
-                                    BDOF_BLOCK_SIZE,
-                                    _src0, MAX_PB_SIZE, block_w, block_h);
-    ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
-                                    BDOF_BLOCK_SIZE,
-                                    _src1, MAX_PB_SIZE, block_w, block_h);
-    int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
-    if (block_w == 8)
-        ff_vvc_derive_bdof_vx_vy_8x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
-    else
-        ff_vvc_derive_bdof_vx_vy_16x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
-
-    for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
-        for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
-            const int16_t *src0 = _src0 + y * MAX_PB_SIZE + x;
-            const int16_t *src1 = _src1 + y * MAX_PB_SIZE + x;
-            pixel *d = dst + x;
-            int idx = BDOF_BLOCK_SIZE * y + x;
-            const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
-            const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
-            int idx1 = y + x / BDOF_MIN_BLOCK_SIZE;
-            FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
-                                                     src0, src1, gh, gv,
-                                                     vx + idx1, vy + idx1);
-        }
-        dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
-    }
-}
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2025-08-14 16:35 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-08-14 16:35 [FFmpeg-devel] [PATCH] vvc-bdof-rework-2 (PR #20241) Zhao Zhili

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git