From: Zhao Zhili <quinkblack-at-foxmail.com@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: Zhao Zhili <zhilizhao@tencent.com>
Subject: [FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: Optimize derive_bdof_vx_vy
Date: Fri, 20 Jun 2025 21:15:20 +0800
Message-ID: <tencent_74EA81130BDC658B84A618109A26F5A7E706@qq.com>

From: Zhao Zhili <zhilizhao@tencent.com>
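
Derive vx and vy for a whole 8- or 16-sample-wide column of 4x4 blocks
per call instead of one 4x4 block at a time: the two rows shared between
vertically adjacent 6x6 windows are computed once and reused, the
left/right edge replication is handled with TBL lookups instead of
pad_mask branches, and the C wrapper now computes all vx/vy up front
into 16-entry arrays.

For reference, a scalar sketch of what is computed per 4x4 block
(paraphrased for illustration only, not the reference C verbatim: the
function name and stride parameters are made up here, and the window is
edge-replicated on all four sides, i.e. the pad_mask == 0xf case of the
old interface):

    #include <stdint.h>
    #include <stddef.h>
    #include "libavutil/common.h"   /* av_clip(), av_log2(), FFABS() */

    #define VVC_SIGN(v) ((v) < 0 ? -1 : !!(v))

    static void derive_vx_vy_4x4(const int16_t *src0, const int16_t *src1,
                                 ptrdiff_t src_stride,
                                 const int16_t *gh0, const int16_t *gh1,
                                 const int16_t *gv0, const int16_t *gv1,
                                 ptrdiff_t grad_stride,
                                 int16_t *vx, int16_t *vy)
    {
        int sgx2 = 0, sgy2 = 0, sgxgy = 0, sgxdi = 0, sgydi = 0;

        /* 6x6 window: the 4x4 block plus a one-sample border */
        for (int y = -1; y <= 4; y++) {
            for (int x = -1; x <= 4; x++) {
                const int xc    = av_clip(x, 0, 3);  /* edge replication */
                const int yc    = av_clip(y, 0, 3);
                const int diff  = (src0[yc * src_stride + xc] >> 4) -
                                  (src1[yc * src_stride + xc] >> 4);
                const int temph = (gh0[yc * grad_stride + xc] +
                                   gh1[yc * grad_stride + xc]) >> 1;
                const int tempv = (gv0[yc * grad_stride + xc] +
                                   gv1[yc * grad_stride + xc]) >> 1;

                sgx2  += FFABS(temph);
                sgy2  += FFABS(tempv);
                sgxgy += VVC_SIGN(tempv) * temph;
                sgxdi -= VVC_SIGN(temph) * diff;
                sgydi -= VVC_SIGN(tempv) * diff;
            }
        }

        *vx = sgx2 ? av_clip((sgxdi * 4) >> av_log2(sgx2), -15, 15) : 0;
        *vy = sgy2 ? av_clip(((sgydi * 4) - ((*vx * sgxgy) >> 1))
                             >> av_log2(sgy2), -15, 15) : 0;
    }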

                               Before             After
-----------------------------------------------------------------
apply_bdof_8_8x16_c:       |   7375.5 ( 1.00x) |  7473.8 ( 1.00x)
apply_bdof_8_8x16_neon:    |   1875.1 ( 3.93x) |  1135.8 ( 6.58x)
apply_bdof_8_16x8_c:       |   7273.9 ( 1.00x) |  7204.0 ( 1.00x)
apply_bdof_8_16x8_neon:    |   1738.2 ( 4.18x) |  1013.0 ( 7.11x)
apply_bdof_8_16x16_c:      |  14744.9 ( 1.00x) | 14712.6 ( 1.00x)
apply_bdof_8_16x16_neon:   |   3446.7 ( 4.28x) |  1997.7 ( 7.36x)
apply_bdof_10_8x16_c:      |   7352.4 ( 1.00x) |  7485.7 ( 1.00x)
apply_bdof_10_8x16_neon:   |   1861.0 ( 3.95x) |  1134.1 ( 6.60x)
apply_bdof_10_16x8_c:      |   7330.5 ( 1.00x) |  7232.8 ( 1.00x)
apply_bdof_10_16x8_neon:   |   1747.2 ( 4.20x) |  1002.6 ( 7.21x)
apply_bdof_10_16x16_c:     |  14522.4 ( 1.00x) | 14664.8 ( 1.00x)
apply_bdof_10_16x16_neon:  |   3490.5 ( 4.16x) |  1978.4 ( 7.41x)
apply_bdof_12_8x16_c:      |   7389.0 ( 1.00x) |  7380.1 ( 1.00x)
apply_bdof_12_8x16_neon:   |   1861.3 ( 3.97x) |  1134.0 ( 6.51x)
apply_bdof_12_16x8_c:      |   7283.1 ( 1.00x) |  7336.9 ( 1.00x)
apply_bdof_12_16x8_neon:   |   1749.1 ( 4.16x) |  1002.3 ( 7.32x)
apply_bdof_12_16x16_c:     |  14580.7 ( 1.00x) | 14502.7 ( 1.00x)
apply_bdof_12_16x16_neon:  |   3472.9 ( 4.20x) |  1978.3 ( 7.33x)
---
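The NEON functions store one (vx, vy) pair per 4x4 block, with each row
of 4x4 blocks starting BDOF_MIN_BLOCK_SIZE entries further into the
16-entry arrays, hence idx1 = y + x / BDOF_MIN_BLOCK_SIZE in
of_template.c.
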
 libavcodec/aarch64/vvc/dsp_init.c    |  17 +-
 libavcodec/aarch64/vvc/inter.S       | 632 ++++++++++++++++-----------
 libavcodec/aarch64/vvc/of_template.c |  15 +-
 3 files changed, 399 insertions(+), 265 deletions(-)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 9a171234f6..1db38ebb1d 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -37,11 +37,18 @@ void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
                                      ptrdiff_t src_stride,
                                      int width, int height);
 
-void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0, const int16_t *_src1,
-                                   int pad_mask,
-                                   const int16_t **gradient_h,
-                                   const int16_t **gradient_v,
-                                   int16_t *vx, int16_t *vy);
+void ff_vvc_derive_bdof_vx_vy_8x_neon(const int16_t *_src0,
+                                      const int16_t *_src1,
+                                      int16_t *const gradient_h[2],
+                                      int16_t *const gradient_v[2],
+                                      int16_t vx[16], int16_t vy[16],
+                                      int block_h);
+void ff_vvc_derive_bdof_vx_vy_16x_neon(const int16_t *_src0,
+                                       const int16_t *_src1,
+                                       int16_t *const gradient_h[2],
+                                       int16_t *const gradient_v[2],
+                                       int16_t vx[16], int16_t vy[16],
+                                       int block_h);
 #define BIT_DEPTH 8
 #include "alf_template.c"
 #include "of_template.c"
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index c299e6f68b..06c6f3619b 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -804,262 +804,388 @@ function ff_vvc_apply_bdof_block_12_neon, export=1
         vvc_apply_bdof_block 12
 endfunc
 
-function ff_vvc_derive_bdof_vx_vy_neon, export=1
-        src0            .req x0
-        src1            .req x1
-        pad_mask        .req w2
-        gh              .req x3
-        gv              .req x4
-        vx              .req x5
-        vy              .req x6
-
-        gh0             .req x7
-        gh1             .req x8
-        gv0             .req x9
-        gv1             .req x10
-        y               .req x12
-
-        sgx2            .req w7
-        sgy2            .req w8
-        sgxgy           .req w9
-        sgxdi           .req w10
-        sgydi           .req w11
-
-        sgx2_v          .req v22
-        sgy2_v          .req v23
-        sgxgy_v         .req v24
-        sgxdi_v         .req v25
-        sgydi_v         .req v26
-
-        sgx2_v2         .req v27
-        sgy2_v2         .req v28
-        sgxgy_v2        .req v29
-        sgxdi_v2        .req v30
-        sgydi_v2        .req v31
-
-        ldp             gh0, gh1, [gh]
-        ldp             gv0, gv1, [gv]
-        movi            sgx2_v.4s, #0
-        movi            sgy2_v.4s, #0
-        movi            sgxgy_v.4s, #0
-        movi            sgxdi_v.4s, #0
-        movi            sgydi_v.4s, #0
-        movi            sgx2_v2.4s, #0
-        movi            sgy2_v2.4s, #0
-        movi            sgxgy_v2.4s, #0
-        movi            sgxdi_v2.4s, #0
-        movi            sgydi_v2.4s, #0
-        mov             x13, #-1                    // dy
-        movi            v6.4s, #0
-        mov             y, #-1
-        tbz             pad_mask, #1, 1f            // check pad top
-        mov             x13, #0                     // dy: pad top
+const bdof_vx_vy_8x_tbl
+        .byte 0, 1, 16, 16, 16, 16, 8, 9
+        .byte 6, 7, 16, 16, 16, 16, 14, 15
+endconst
+
+const bdof_vx_vy_16x_tbl
+        .byte 0,  1,  64, 64, 64, 64, 8,  9
+        .byte 6,  7,  64, 64, 64, 64, 16, 17
+        .byte 14, 15, 64, 64, 64, 64, 24, 25
+        .byte 22, 23, 64, 64, 64, 64, 30, 31
+endconst
+
+// line(-1), line0, line1, line2, line3, line4
+// line3 and line4 become line(-1) and line0 in the next block.
+.macro bdof_vx_vy_8x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
+        mov             \tmp0\().16b, v28.16b
+        mov             \tmp1\().16b, v29.16b
+        mov             \tmp2\().16b, v30.16b
+        mov             \tmp3\().16b, v31.16b
+        mov             \tmp4\().16b, v8.16b
+.endm
+
+.macro bdof_vx_vy_8x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
+        add             v25.4s, v25.4s, \tmp0\().4s
+        add             v27.4s, v27.4s, \tmp1\().4s
+        add             v23.4s, v23.4s, \tmp2\().4s
+        sub             v26.4s, v26.4s, \tmp3\().4s
+        sub             v24.4s, v24.4s, \tmp4\().4s
+.endm
+
+.macro bdof_vx_vy_8x_padding_left_right src, tmp0, tmp1, dst
+        tbl             \tmp0\().16b, { \src\().16b }, v0.16b
+        saddl           \tmp1\().4s, \tmp0\().4h, \src\().4h
+        saddl2          \dst\().4s, \tmp0\().8h, \src\().8h
+        addp            \dst\().4s, \tmp1\().4s, \dst\().4s
+.endm
+
+.macro bdof_vx_vy_sign src, tmp0, tmp1, dst
+        cmlt            \tmp0\().8h, \src\().8h, #0
+        cmgt            \tmp1\().8h, \src\().8h, #0
+        sub             \dst\().8h, \tmp0\().8h, \tmp1\().8h
+.endm
+
+.macro bdof_vx_vy_clip_mask src, max, min, mask, dst
+        smin            \src\().4s, \src\().4s, \max\().4s
+        smax            \src\().4s, \src\().4s, \min\().4s
+        cmgt            \mask\().4s, \mask\().4s, #0
+        and             \dst\().16b, \src\().16b, \mask\().16b
+.endm
+
+.macro bdof_vx_vy_16x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
+        mov             \tmp0\().16b, v29.16b
+        mov             \tmp1\().16b, v30.16b
+        mov             \tmp2\().16b, v31.16b
+        mov             \tmp3\().16b, v8.16b
+        mov             \tmp4\().16b, v9.16b
+.endm
+
+.macro bdof_vx_vy_16x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
+        add             v25.4s, v25.4s, \tmp0\().4s
+        add             v24.4s, v24.4s, \tmp1\().4s
+        add             v26.4s, v26.4s, \tmp2\().4s
+        sub             v28.4s, v28.4s, \tmp3\().4s
+        sub             v27.4s, v27.4s, \tmp4\().4s
+.endm
+
+.macro bdof_vx_vy_16x_padding_left_right src0, src1, tmp0, tmp1, tmp2, dst
+        tbl             \tmp0\().16b, {\src0\().16b, \src1\().16b}, v0.16b
+        tbl             v2.16b, {\src0\().16b, \src1\().16b}, v1.16b
+        saddl           \tmp1\().4s, \tmp0\().4h, \src0\().4h
+        saddl           \tmp2\().4s, v2.4h, \src1\().4h
+        saddl2          \tmp0\().4s, \tmp0\().8h, \src0\().8h
+        saddl2          \dst\().4s, v2.8h, \src1\().8h
+        addp            \tmp0\().4s, \tmp1\().4s, \tmp0\().4s
+        addp            \dst\().4s, \tmp2\().4s, \dst\().4s
+        addp            \dst\().4s, \tmp0\().4s, \dst\().4s
+.endm
+
+/*
+ * x0: const int16_t *_src0,
+ * x1: const int16_t *_src1,
+ * x2: int16_t *gradient_h[2],
+ * x3: int16_t *gradient_v[2],
+ * x4: int16_t vx[16],
+ * x5: int16_t vy[16],
+ * w6: int block_h
+ */
+function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
+        stp             d11, d10, [sp, #-0x20]!
+        stp             d9, d8, [sp, #0x10]
+
+        ldp             x14, x13, [x2]                      // gh0, gh1
+        ldp             x10, x9, [x3]                       // gv0, gv1
+        movrel          x11, bdof_vx_vy_8x_tbl
+        ldr             q0, [x11]                           // table
+        mvni            v2.4s, #30                          // -31, for log2
+        movi            v3.4s, #15                          // clip to 15
+        mvni            v4.4s, #14                          // clip to -15
+
+        mov             w11, #0x8
+        mov             w12, w6                             // y = block_h
+        b               4f
+
 1:
-        mov             x16, #-2                    // dx
-        add             x14, src0, x13, lsl #8      // local src0
-        add             x15, src1, x13, lsl #8      // local src1
-        add             x17, x16, x13, lsl #5
-        ldr             q0, [x14, x16]
-        ldr             q1, [x15, x16]
-        ldr             q2, [gh0, x17]
-        ldr             q3, [gh1, x17]
-        ldr             q4, [gv0, x17]
-        ldr             q5, [gv1, x17]
-        add             x16, x16, #8
-        add             x17, x17, #8
-        ins             v0.s[3], v6.s[3]
-        ins             v1.s[3], v6.s[3]
-        ins             v2.s[3], v6.s[3]
-        ins             v3.s[3], v6.s[3]
-        ins             v4.s[3], v6.s[3]
-        ins             v5.s[3], v6.s[3]
-
-        ldr             q16, [x14, x16]
-        ldr             q17, [x15, x16]
-        ldr             q18, [gh0, x17]
-        ldr             q19, [gh1, x17]
-        ldr             q20, [gv0, x17]
-        ldr             q21, [gv1, x17]
-        ins             v16.s[3], v6.s[3]
-        ins             v17.s[3], v6.s[3]
-        ins             v18.s[3], v6.s[3]
-        ins             v19.s[3], v6.s[3]
-        ins             v20.s[3], v6.s[3]
-        ins             v21.s[3], v6.s[3]
-
-        tbz             pad_mask, #0, 20f
-        // pad left
-        ins             v0.h[0], v0.h[1]
-        ins             v1.h[0], v1.h[1]
-        ins             v2.h[0], v2.h[1]
-        ins             v3.h[0], v3.h[1]
-        ins             v4.h[0], v4.h[1]
-        ins             v5.h[0], v5.h[1]
-20:
-        tbz             pad_mask, #2, 21f
-        // pad right
-        ins             v16.h[5], v16.h[4]
-        ins             v17.h[5], v17.h[4]
-        ins             v18.h[5], v18.h[4]
-        ins             v19.h[5], v19.h[4]
-        ins             v20.h[5], v20.h[4]
-        ins             v21.h[5], v21.h[4]
-21:
-        sshr            v0.8h, v0.8h, #4
-        sshr            v1.8h, v1.8h, #4
-        add             v2.8h, v2.8h, v3.8h
-        add             v4.8h, v4.8h, v5.8h
-        sub             v0.8h, v0.8h, v1.8h         // diff
-        sshr            v2.8h, v2.8h, #1            // temph
-        sshr            v4.8h, v4.8h, #1            // tempv
-
-        sshr            v16.8h, v16.8h, #4
-        sshr            v17.8h, v17.8h, #4
-        add             v18.8h, v18.8h, v19.8h
-        add             v20.8h, v20.8h, v21.8h
-        sub             v16.8h, v16.8h, v17.8h      // diff
-        sshr            v18.8h, v18.8h, #1          // temph
-        sshr            v20.8h, v20.8h, #1          // tempv
-
-        abs             v3.8h, v2.8h
-        abs             v5.8h, v4.8h
-        uxtl            v19.4s, v3.4h
-        uxtl            v21.4s, v5.4h
-        uxtl2           v3.4s, v3.8h
-        uxtl2           v5.4s, v5.8h
-        add             v3.4s, v3.4s, v19.4s
-        add             v5.4s, v5.4s, v21.4s
-        add             sgx2_v.4s, sgx2_v.4s, v3.4s
-        add             sgy2_v.4s, sgy2_v.4s, v5.4s
-
-        abs             v3.8h, v18.8h
-        abs             v5.8h, v20.8h
-        uxtl            v19.4s, v3.4h
-        uxtl            v21.4s, v5.4h
-        uxtl2           v3.4s, v3.8h
-        uxtl2           v5.4s, v5.8h
-        add             v3.4s, v3.4s, v19.4s
-        add             v5.4s, v5.4s, v21.4s
-        add             sgx2_v2.4s, sgx2_v2.4s, v3.4s
-        add             sgy2_v2.4s, sgy2_v2.4s, v5.4s
-
-        cmgt            v17.8h, v4.8h, #0
-        cmlt            v7.8h, v4.8h, #0
-        cmgt            v19.8h, v20.8h, #0
-        cmlt            v21.8h, v20.8h, #0
-        sub             v17.8h, v7.8h, v17.8h       // VVC_SIGN(tempv)
-        sub             v19.8h, v21.8h, v19.8h      // VVC_SIGN(tempv)
-
-        smlal           sgxgy_v.4s, v17.4h, v2.4h
-        smlal2          sgxgy_v.4s, v17.8h, v2.8h
-        smlsl           sgydi_v.4s, v17.4h, v0.4h
-        smlsl2          sgydi_v.4s, v17.8h, v0.8h
-
-        cmgt            v3.8h, v2.8h, #0
-        cmlt            v5.8h, v2.8h, #0
-        cmgt            v17.8h, v18.8h, #0
-        cmlt            v21.8h, v18.8h, #0
-        sub             v3.8h, v5.8h, v3.8h         // VVC_SIGN(temph)
-        sub             v17.8h, v21.8h, v17.8h      // VVC_SIGN(temph)
-
-        smlal           sgxgy_v2.4s, v19.4h, v18.4h
-        smlal2          sgxgy_v2.4s, v19.8h, v18.8h
-        smlsl           sgydi_v2.4s, v19.4h, v16.4h
-        smlsl2          sgydi_v2.4s, v19.8h, v16.8h
-
-        smlsl           sgxdi_v.4s, v3.4h, v0.4h
-        smlsl2          sgxdi_v.4s, v3.8h, v0.8h
-        smlsl           sgxdi_v2.4s, v17.4h, v16.4h
-        smlsl2          sgxdi_v2.4s, v17.8h, v16.8h
-3:
-        add             y, y, #1
-        cmp             y, #(BDOF_MIN_BLOCK_SIZE)
-        mov             x13, y
-        b.gt            4f
-        b.lt            1b
-        tbz             pad_mask, #3, 1b
-        sub             x13, x13, #1                // pad bottom
-        b               1b
+        // save line4 results
+        bdof_vx_vy_8x_save_line v5, v6, v7, v16, v17
+2:
+        addp            v25.4s, v25.4s, v25.4s
+        addp            v27.4s, v27.4s, v27.4s
+        addp            v26.4s, v26.4s, v26.4s
+        addp            v23.4s, v23.4s, v23.4s
+        addp            v24.4s, v24.4s, v24.4s
+
+        clz             v28.4s, v25.4s
+        add             v28.4s, v28.4s, v2.4s               // log2
+        shl             v26.4s, v26.4s, #0x2
+        sshl            v26.4s, v26.4s, v28.4s
+
+        bdof_vx_vy_clip_mask v26, v3, v4, v25, v25
+        sqxtn           v26.4h, v25.4s
+        st1             {v26.s}[0], [x4], x11
+
+        subs            x12, x12, #(BDOF_MIN_BLOCK_SIZE)
+
+        clz             v26.4s, v27.4s
+        add             v26.4s, v26.4s, v2.4s
+        shl             v24.4s, v24.4s, #0x2
+        mul             v23.4s, v25.4s, v23.4s
+        sshr            v23.4s, v23.4s, #0x1
+        sub             v23.4s, v24.4s, v23.4s
+        sshl            v23.4s, v23.4s, v26.4s
+
+        bdof_vx_vy_clip_mask v23, v3, v4, v27, v23
+        sqxtn           v23.4h, v23.4s
+        st1             {v23.s}[0], [x5], x11
+
+        b.eq            16f
 4:
-        addv            s22, sgx2_v.4s
-        addv            s23, sgy2_v.4s
-        addv            s24, sgxgy_v.4s
-        addv            s25, sgxdi_v.4s
-        addv            s26, sgydi_v.4s
-
-        mov             w3, #31
-        mov             w16, #-15
-        mov             w17, #15
-40:
-        mov             w14, #0
-
-        mov             sgx2, v22.s[0]
-        mov             sgy2, v23.s[0]
-        mov             sgxgy, v24.s[0]
-        mov             sgxdi, v25.s[0]
-        mov             sgydi, v26.s[0]
-
-        cbz             sgx2, 5f
-        clz             w12, sgx2
-        lsl             sgxdi, sgxdi, #2
-        sub             w13, w3, w12                // log2(sgx2)
-        asr             sgxdi, sgxdi, w13
-        cmp             sgxdi, w16
-        csel            w14, w16, sgxdi, lt         // clip to -15
-        b.le            5f
-        cmp             sgxdi, w17
-        csel            w14, w17, sgxdi, gt         // clip to 15
+        mov             x15, #0x0                           // dy, inner loop
+
+        movi            v25.2d, #0
+        movi            v27.2d, #0
+        movi            v23.2d, #0
+        movi            v26.2d, #0
+        movi            v24.2d, #0
+        b               8f
+
 5:
-        strh            w14, [vx], #2
-
-        mov             w15, #0
-        cbz             sgy2, 6f
-        lsl             sgydi, sgydi, #2
-        smull           x14, w14, sgxgy
-        asr             w14, w14, #1
-        sub             sgydi, sgydi, w14
-        clz             w12, sgy2
-        sub             w13, w3, w12                // log2(sgy2)
-        asr             sgydi, sgydi, w13
-        cmp             sgydi, w16
-        csel            w15, w16, sgydi, lt         // clip to -15
-        b.le            6f
-        cmp             sgydi, w17
-        csel            w15, w17, sgydi, gt         // clip to 15
-6:
-        strh            w15, [vy], #2
-        cbz             x0, 7f
-        addv            s22, sgx2_v2.4s
-        addv            s23, sgy2_v2.4s
-        addv            s24, sgxgy_v2.4s
-        addv            s25, sgxdi_v2.4s
-        addv            s26, sgydi_v2.4s
-        mov             x0, #0
-        b               40b
-7:
+        // add line(-1) and line0 from previous results
+        bdof_vx_vy_8x_add_line v18, v19, v20, v21, v22
+        bdof_vx_vy_8x_add_line v5, v6, v7, v16, v17
+        add             x15, x15, #1
+8:
+        cmp             w12, w6
+        b.hs            9f
+        // y < block_h && dy == 0, reuse previous results
+        cbz             x15, 5b
+9:
+        ldr             q28, [x0]                                   // src0
+        ldr             q29, [x1]                                   // src1
+        ldr             q30, [x14], #(BDOF_BLOCK_SIZE * 2)          // gh0
+        ldr             q31, [x13], #(BDOF_BLOCK_SIZE * 2)          // gh1
+        ldr             q8, [x10], #(BDOF_BLOCK_SIZE * 2)           // gv0
+        ldr             q9, [x9], #(BDOF_BLOCK_SIZE * 2)            // gv1
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2)
+        add             x1, x1, #(VVC_MAX_PB_SIZE * 2)
+
+        sshr            v28.8h, v28.8h, #0x4
+        sshr            v29.8h, v29.8h, #0x4
+        shadd           v30.8h, v30.8h, v31.8h                      // tmph
+        shadd           v31.8h, v8.8h, v9.8h                        // tmpv
+        sub             v8.8h, v28.8h, v29.8h                       // diff
+
+        abs             v28.8h, v30.8h
+        abs             v29.8h, v31.8h
+
+        bdof_vx_vy_8x_padding_left_right v28, v9, v10, v28
+        bdof_vx_vy_8x_padding_left_right v29, v9, v10, v29
+
+        bdof_vx_vy_sign v30, v9, v10, v9
+        bdof_vx_vy_sign v31, v10, v31, v31
+
+        mul             v30.8h, v31.8h, v30.8h
+        mul             v9.8h, v9.8h, v8.8h
+        mul             v8.8h, v31.8h, v8.8h
+
+        bdof_vx_vy_8x_padding_left_right v30, v31, v10, v30
+        bdof_vx_vy_8x_padding_left_right v9, v31, v10, v31
+        bdof_vx_vy_8x_padding_left_right v8, v9, v10, v8
+
+        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+
+        cmp             w12, w6
+        b.ne            10f
+        cbnz            x15, 10f
+
+        // y == block_h && dy == 0, duplicate first line results
+        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+        add             x15, x15, #0x1
+        b               9b
+10:
+        cmp             x15, #(BDOF_MIN_BLOCK_SIZE - 1)
+        b.eq            11f
+        cmp             x15, #(BDOF_MIN_BLOCK_SIZE)
+        b.ne            12f
+        b               1b
+11:
+        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
+        // duplicate the results and break
+        cmp             x12, #(BDOF_MIN_BLOCK_SIZE)
+        b.eq            13f
+        bdof_vx_vy_8x_save_line v18, v19, v20, v21, v22
+12:
+        add             x15, x15, #1
+        b               8b
+13:
+        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
+        // padding bottom then break
+        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+        b               2b
+16:
+        ldp             d9, d8, [sp, #0x10]
+        ldp             d11, d10, [sp], #0x20
         ret
+endfunc
 
-.unreq src0
-.unreq src1
-.unreq pad_mask
-.unreq gh
-.unreq gv
-.unreq vx
-.unreq vy
-.unreq sgx2
-.unreq sgy2
-.unreq sgxgy
-.unreq sgxdi
-.unreq sgydi
-.unreq sgx2_v
-.unreq sgy2_v
-.unreq sgxgy_v
-.unreq sgxdi_v
-.unreq sgydi_v
-.unreq sgx2_v2
-.unreq sgy2_v2
-.unreq sgxgy_v2
-.unreq sgxdi_v2
-.unreq sgydi_v2
-.unreq y
+/*
+ * x0: const int16_t *_src0,
+ * x1: const int16_t *_src1,
+ * x2: int16_t *gradient_h[2],
+ * x3: int16_t *gradient_v[2],
+ * x4: int16_t vx[16],
+ * x5: int16_t vy[16],
+ * w6: int block_h
+ */
+function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
+        sub             sp, sp, #0x80
+        stp             d15, d14, [sp, #0x30]
+        stp             d13, d12, [sp, #0x40]
+        stp             d11, d10, [sp, #0x50]
+        stp             d9, d8,   [sp, #0x60]
+        stp             x29, x30, [sp, #0x70]
+
+        ldp             x8, x9, [x2]                        // gh0, gh1
+        ldp             x10, x11, [x3]                      // gv0, gv1
+        movrel          x12, bdof_vx_vy_16x_tbl
+        ldp             q0, q1, [x12]                       // table
+        mov             w13, w6                             // y = block_h
+        b               4f
+
+1:
+        // save line4
+        bdof_vx_vy_16x_save_line v6, v7, v16, v17, v18
+2:
+        clz             v3.4s, v25.4s
+        mvni            v5.4s, #0x1e
+        add             v3.4s, v3.4s, v5.4s                 // -log2()
+        shl             v4.4s, v28.4s, #0x2
+        sshl            v3.4s, v4.4s, v3.4s
+
+        movi            v28.4s, #0xf                        // clip to 15
+        mvni            v29.4s, #0xe                        // clip to -15
+        bdof_vx_vy_clip_mask v3, v28, v29, v25, v3
+        sqxtn           v4.4h, v3.4s
+        st1             {v4.d}[0], [x4], #(BDOF_MIN_BLOCK_SIZE * 2)
+
+        subs            x13, x13, #(BDOF_MIN_BLOCK_SIZE)    // y -= BDOF_MIN_BLOCK_SIZE
+
+        clz             v4.4s, v24.4s
+        add             v4.4s, v4.4s, v5.4s                 // -log2()
+        shl             v5.4s, v27.4s, #0x2
+        mul             v3.4s, v3.4s, v26.4s
+        sshr            v3.4s, v3.4s, #0x1
+        sub             v3.4s, v5.4s, v3.4s
+        sshl            v3.4s, v3.4s, v4.4s
+
+        bdof_vx_vy_clip_mask v3, v28, v29, v24, v3
+        sqxtn           v3.4h, v3.4s
+        st1             {v3.d}[0], [x5], #(BDOF_MIN_BLOCK_SIZE * 2)
+        b.eq            16f
+4:
+        mov             w14, #0x0                           // dy, inner loop
+
+        movi            v25.2d, #0
+        movi            v24.2d, #0
+        movi            v26.2d, #0
+        movi            v28.2d, #0
+        movi            v27.2d, #0
+        b               8f
+
+5:
+        // add line(-1) and line0 from previous results
+        bdof_vx_vy_16x_add_line v19, v20, v21, v22, v23
+        bdof_vx_vy_16x_add_line v6, v7, v16, v17, v18
+        add             w14, w14, #0x1
+
+8:
+        cmp             w13, w6
+        b.hs            9f
+        // y < block_h && dy == 0, reuse previous results
+        cbz             w14, 5b
+9:
+        ld1             {v29.8h, v30.8h}, [x0]              // src0
+        sshr            v31.8h, v29.8h, #0x4
+        ld1             {v8.8h, v9.8h}, [x1]                // src1
+        sshr            v10.8h, v8.8h, #0x4
+        ld1             {v11.8h, v12.8h}, [x8], #32         // gh0
+        sshr            v29.8h, v30.8h, #0x4
+        sshr            v30.8h, v9.8h, #0x4
+        ld1             {v8.8h, v9.8h}, [x9], #32           // gh1
+        shadd           v13.8h, v11.8h, v8.8h               // (gh0 + gh1) >> 1, left half
+        ld1             {v14.8h, v15.8h}, [x10], #32        // gv0
+        ld1             {v3.8h, v4.8h}, [x11], #32          // gv1
+        shadd           v5.8h, v14.8h, v3.8h                // (gv0 + gv1) >> 1, left half
+        sub             v31.8h, v31.8h, v10.8h              // diff, left half
+        shadd           v8.8h, v12.8h, v9.8h                // (gh0 + gh1) >> 1, right half
+        shadd           v3.8h, v15.8h, v4.8h                // (gv0 + gv1) >> 1, right half
+        sub             v4.8h, v29.8h, v30.8h               // diff, right half
+
+        abs             v29.8h, v13.8h
+        abs             v30.8h, v8.8h
+        abs             v9.8h, v5.8h
+        abs             v10.8h, v3.8h
+
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2)
+        add             x1, x1, #(VVC_MAX_PB_SIZE * 2)
+
+        bdof_vx_vy_16x_padding_left_right v29, v30, v11, v12, v14, v29
+        bdof_vx_vy_16x_padding_left_right v9, v10, v11, v12, v14, v30
+
+        bdof_vx_vy_sign v13, v9, v10, v9
+        bdof_vx_vy_sign v8, v10, v11, v10
+        bdof_vx_vy_sign v5, v11, v5, v5
+        bdof_vx_vy_sign v3, v11, v3, v3
+
+        mul             v11.8h, v5.8h, v13.8h
+        mul             v12.8h, v3.8h, v8.8h
+        mul             v8.8h, v9.8h, v31.8h
+        mul             v9.8h, v10.8h, v4.8h
+        mul             v13.8h, v5.8h, v31.8h
+        mul             v14.8h, v3.8h, v4.8h
+
+        bdof_vx_vy_16x_padding_left_right v11, v12, v3, v4, v5, v31
+        bdof_vx_vy_16x_padding_left_right v8, v9, v3, v4, v5, v8
+        bdof_vx_vy_16x_padding_left_right v13, v14, v3, v4, v5, v9
+
+        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+        // check whether the top needs padding
+        cmp             w13, w6
+        b.ne            10f
+        cbnz            w14, 10f
+        // y == block_h && dy == 0, padding top
+        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+        add             w14, w14, #0x1
+        b               9b
+10:
+        cmp             w14, #(BDOF_MIN_BLOCK_SIZE - 1)
+        b.eq            11f
+        cmp             w14, #(BDOF_MIN_BLOCK_SIZE)
+        b.ne            12f
+        // save line4
+        b               1b
+11:
+        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1, padding bottom
+        cmp             x13, #(BDOF_MIN_BLOCK_SIZE)
+        b.eq            13f
+        // save line3
+        bdof_vx_vy_16x_save_line v19, v20, v21, v22, v23
+12:
+        add             w14, w14, #0x1                      // dy++
+        b               8b
+13:
+        // padding bottom
+        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+        b               2b
+16:
+        // restore
+        ldp             x29, x30, [sp, #0x70]
+        ldp             d9, d8, [sp, #0x60]
+        ldp             d11, d10, [sp, #0x50]
+        ldp             d13, d12, [sp, #0x40]
+        ldp             d15, d14, [sp, #0x30]
+        add             sp, sp, #0x80
+        ret
 endfunc
diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
index ac6182b09d..d8ddaacb14 100644
--- a/libavcodec/aarch64/vvc/of_template.c
+++ b/libavcodec/aarch64/vvc/of_template.c
@@ -41,6 +41,11 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
     ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
                                     BDOF_BLOCK_SIZE,
                                     _src1, MAX_PB_SIZE, block_w, block_h);
+    int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
+    if (block_w == 8)
+        ff_vvc_derive_bdof_vx_vy_8x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
+    else
+        ff_vvc_derive_bdof_vx_vy_16x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
 
     for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
         for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
@@ -50,14 +55,10 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
             int idx = BDOF_BLOCK_SIZE * y + x;
             const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
             const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
-            int16_t vx[2], vy[2];
-            int pad_mask = !x | ((!y) << 1) |
-                           ((x + 2 * BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
-                           ((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
-            ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, vx, vy);
+            int idx1 = y + x / BDOF_MIN_BLOCK_SIZE;
             FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
-                                                             src0, src1, gh, gv,
-                                                             vx, vy);
+                                                             src0, src1, gh, gv,
+                                                             vx + idx1, vy + idx1);
         }
         dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
     }
-- 
2.46.0
