From mboxrd@z Thu Jan 1 00:00:00 1970
From: Zhao Zhili
To: ffmpeg-devel@ffmpeg.org
Subject: [FFmpeg-devel] [PATCH] vvc-bdof-rework-2 (PR #20241)
Date: Thu, 14 Aug 2025 19:35:38 +0300 (EEST)
Message-Id: <20250814163538.5E06968D203@ffbox0-bg.ffmpeg.org>
Reply-To: FFmpeg development discussions and patches

PR #20241 opened by Zhao Zhili (quink)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20241
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20241.patch

>From c3362c98ce019e218463a62fbee770b16c3bb478 Mon Sep 17 00:00:00 2001
From: Zhao Zhili
Date: Fri, 20 Jun 2025 21:15:20 +0800
Subject: [PATCH 1/2] avcodec/aarch64/vvc: Optimize derive_bdof_vx_vy

                                     Before             After
-----------------------------------------------------------------
apply_bdof_8_8x16_c:      |  7375.5 ( 1.00x) |  7473.8 ( 1.00x)
apply_bdof_8_8x16_neon:   |  1875.1 ( 3.93x) |  1135.8 ( 6.58x)
apply_bdof_8_16x8_c:      |  7273.9 ( 1.00x) |  7204.0 ( 1.00x)
apply_bdof_8_16x8_neon:   |  1738.2 ( 4.18x) |  1013.0 ( 7.11x)
apply_bdof_8_16x16_c:     | 14744.9 ( 1.00x) | 14712.6 ( 1.00x)
apply_bdof_8_16x16_neon:  |  3446.7 ( 4.28x) |  1997.7 ( 7.36x)
apply_bdof_10_8x16_c:     |  7352.4 ( 1.00x) |  7485.7 ( 1.00x)
apply_bdof_10_8x16_neon:  |  1861.0 ( 3.95x) |  1134.1 ( 6.60x)
apply_bdof_10_16x8_c:     |  7330.5 ( 1.00x) |  7232.8 ( 1.00x)
apply_bdof_10_16x8_neon:  |  1747.2 ( 4.20x) |  1002.6 ( 7.21x)
apply_bdof_10_16x16_c:    | 14522.4 ( 1.00x) | 14664.8 ( 1.00x)
apply_bdof_10_16x16_neon: |  3490.5 ( 4.16x) |  1978.4 ( 7.41x)
apply_bdof_12_8x16_c:     |  7389.0 ( 1.00x) |  7380.1 ( 1.00x)
apply_bdof_12_8x16_neon:  |  1861.3 ( 3.97x) |  1134.0 ( 6.51x)
apply_bdof_12_16x8_c:     |  7283.1 ( 1.00x) |  7336.9 ( 1.00x)
apply_bdof_12_16x8_neon:  |  1749.1 ( 4.16x) |  1002.3 ( 7.32x)
apply_bdof_12_16x16_c:    | 14580.7 ( 1.00x) | 14502.7 ( 1.00x)
apply_bdof_12_16x16_neon: |  3472.9 ( 4.20x) |  1978.3 ( 7.33x)

Signed-off-by: Zhao Zhili
---
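Note: per 4x4 sub-block, the derivation being vectorized here reduces to the
following C sketch (illustrative only, distilled from the scalar tail of the
old asm below; av_clip()/av_log2() as in libavutil, the function name is
invented for this example):

    #include "libavutil/common.h"   // av_clip(), av_log2()

    // Given the five 6x6-window sums of one 4x4 sub-block, derive the
    // motion refinement. Mirrors the asm sequence: shift left by 2,
    // arithmetic shift right by log2() of the energy term, clip to
    // [-15, 15], and return 0 when the energy term is not positive.
    static inline void derive_vx_vy_sketch(int sgx2, int sgy2, int sgxgy,
                                           int sgxdi, int sgydi,
                                           int16_t *vx, int16_t *vy)
    {
        *vx = sgx2 > 0 ? av_clip((sgxdi * 4) >> av_log2(sgx2), -15, 15) : 0;
        *vy = sgy2 > 0 ? av_clip((sgydi * 4 - ((*vx * sgxgy) >> 1))
                                 >> av_log2(sgy2), -15, 15) : 0;
    }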
 libavcodec/aarch64/vvc/dsp_init.c    |  17 +-
 libavcodec/aarch64/vvc/inter.S       | 606 ++++++++++++++++-----------
 libavcodec/aarch64/vvc/of_template.c |  15 +-
 3 files changed, 386 insertions(+), 252 deletions(-)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 9a171234f6..1db38ebb1d 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -37,11 +37,18 @@ void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
                                      ptrdiff_t src_stride,
                                      int width, int height);
 
-void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0, const int16_t *_src1,
-                                   int pad_mask,
-                                   const int16_t **gradient_h,
-                                   const int16_t **gradient_v,
-                                   int16_t *vx, int16_t *vy);
+void ff_vvc_derive_bdof_vx_vy_8x_neon(const int16_t *_src0,
+                                      const int16_t *_src1,
+                                      int16_t *const gradient_h[2],
+                                      int16_t *const gradient_v[2],
+                                      int16_t vx[16], int16_t vy[16],
+                                      int block_h);
+void ff_vvc_derive_bdof_vx_vy_16x_neon(const int16_t *_src0,
+                                       const int16_t *_src1,
+                                       int16_t *const gradient_h[2],
+                                       int16_t *const gradient_v[2],
+                                       int16_t vx[16], int16_t vy[16],
+                                       int block_h);
 
 #define BIT_DEPTH 8
 #include "alf_template.c"
 #include "of_template.c"
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index c299e6f68b..06c6f3619b 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -804,262 +804,388 @@ function ff_vvc_apply_bdof_block_12_neon, export=1
         vvc_apply_bdof_block 12
 endfunc
 
-function ff_vvc_derive_bdof_vx_vy_neon, export=1
-        src0 .req x0
-        src1 .req x1
-        pad_mask .req w2
-        gh .req x3
-        gv .req x4
-        vx .req x5
-        vy .req x6
+const bdof_vx_vy_8x_tbl
+        .byte 0, 1, 16, 16, 16, 16, 8, 9
+        .byte 6, 7, 16, 16, 16, 16, 14, 15
+endconst
 
-        gh0 .req x7
-        gh1 .req x8
-        gv0 .req x9
-        gv1 .req x10
-        y .req x12
+const bdof_vx_vy_16x_tbl
+        .byte 0, 1, 64, 64, 64, 64, 8, 9
+        .byte 6, 7, 64, 64, 64, 64, 16, 17
+        .byte 14, 15, 64, 64, 64, 64, 24, 25
+        .byte 22, 23, 64, 64, 64, 64, 30, 31
+endconst
 
-        sgx2 .req w7
-        sgy2 .req w8
-        sgxgy .req w9
-        sgxdi .req w10
-        sgydi .req w11
+// line(-1), line0, line1, line2, line3, line4
+// line3 and line4 become line(-1) and line0 in the next block.
+.macro bdof_vx_vy_8x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
+        mov \tmp0\().16b, v28.16b
+        mov \tmp1\().16b, v29.16b
+        mov \tmp2\().16b, v30.16b
+        mov \tmp3\().16b, v31.16b
+        mov \tmp4\().16b, v8.16b
+.endm
 
-        sgx2_v .req v22
-        sgy2_v .req v23
-        sgxgy_v .req v24
-        sgxdi_v .req v25
-        sgydi_v .req v26
+.macro bdof_vx_vy_8x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
+        add v25.4s, v25.4s, \tmp0\().4s
+        add v27.4s, v27.4s, \tmp1\().4s
+        add v23.4s, v23.4s, \tmp2\().4s
+        sub v26.4s, v26.4s, \tmp3\().4s
+        sub v24.4s, v24.4s, \tmp4\().4s
+.endm
 
-        sgx2_v2 .req v27
-        sgy2_v2 .req v28
-        sgxgy_v2 .req v29
-        sgxdi_v2 .req v30
-        sgydi_v2 .req v31
+.macro bdof_vx_vy_8x_padding_left_right src, tmp0, tmp1, dst
+        tbl \tmp0\().16b, { \src\().16b }, v0.16b
+        saddl \tmp1\().4s, \tmp0\().4h, \src\().4h
+        saddl2 \dst\().4s, \tmp0\().8h, \src\().8h
+        addp \dst\().4s, \tmp1\().4s, \dst\().4s
+.endm
+
+.macro bdof_vx_vy_sign src, tmp0, tmp1, dst
+        cmlt \tmp0\().8h, \src\().8h, #0
+        cmgt \tmp1\().8h, \src\().8h, #0
+        sub \dst\().8h, \tmp0\().8h, \tmp1\().8h
+.endm
+
+.macro bdof_vx_vy_clip_mask src, max, min, mask, dst
+        smin \src\().4s, \src\().4s, \max\().4s
+        smax \src\().4s, \src\().4s, \min\().4s
+        cmgt \mask\().4s, \mask\().4s, #0
+        and \dst\().16b, \src\().16b, \mask\().16b
+.endm
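+
+// How the padding macros work: v0 (and v1 in the 16x variant) hold the tbl
+// tables above; out-of-range indices read back as zero, so the shuffle
+// replicates the edge columns, and the widening adds plus pairwise addp
+// produce the padded 6-column window sums of each 4x4 sub-block.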
+
+.macro bdof_vx_vy_16x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
+        mov \tmp0\().16b, v29.16b
+        mov \tmp1\().16b, v30.16b
+        mov \tmp2\().16b, v31.16b
+        mov \tmp3\().16b, v8.16b
+        mov \tmp4\().16b, v9.16b
+.endm
+
+.macro bdof_vx_vy_16x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
+        add v25.4s, v25.4s, \tmp0\().4s
+        add v24.4s, v24.4s, \tmp1\().4s
+        add v26.4s, v26.4s, \tmp2\().4s
+        sub v28.4s, v28.4s, \tmp3\().4s
+        sub v27.4s, v27.4s, \tmp4\().4s
+.endm
+
+.macro bdof_vx_vy_16x_padding_left_right src0, src1, tmp0, tmp1, tmp2, dst
+        tbl \tmp0\().16b, {\src0\().16b, \src1\().16b}, v0.16b
+        tbl v2.16b, {\src0\().16b, \src1\().16b}, v1.16b
+        saddl \tmp1\().4s, \tmp0\().4h, \src0\().4h
+        saddl \tmp2\().4s, v2.4h, \src1\().4h
+        saddl2 \tmp0\().4s, \tmp0\().8h, \src0\().8h
+        saddl2 \dst\().4s, v2.8h, \src1\().8h
+        addp \tmp0\().4s, \tmp1\().4s, \tmp0\().4s
+        addp \dst\().4s, \tmp2\().4s, \dst\().4s
+        addp \dst\().4s, \tmp0\().4s, \dst\().4s
+.endm
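+
+// Accumulation scheme: each 4x4 sub-block needs a 6x6 window of sums. Per
+// source row, the macros above build the padded horizontal sums of
+// |temph|, |tempv|, sign(tempv)*temph, sign(temph)*diff and
+// sign(tempv)*diff; six such rows (with top/bottom replication) are
+// accumulated into v23-v27, and the two rows shared with the following
+// 4-row stripe are saved and re-added instead of being recomputed.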
+
+/*
+ * x0: const int16_t *_src0,
+ * x1: const int16_t *_src1,
+ * x2: int16_t *gradient_h[2],
+ * x3: int16_t *gradient_v[2],
+ * x4: int16_t vx[16],
+ * x5: int16_t vy[16],
+ * w6: int block_h
+ */
+function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
+        stp d11, d10, [sp, #-0x20]!
+        stp d9, d8, [sp, #0x10]
+
+        ldp x14, x13, [x2] // gh0, gh1
+        ldp x10, x9, [x3] // gv0, gv1
+        movrel x11, bdof_vx_vy_8x_tbl
+        ldr q0, [x11] // table
+        mvni v2.4s, #30 // -31, for log2
+        movi v3.4s, #15 // clip to 15
+        mvni v4.4s, #14 // clip to -15
+
+        mov w11, #0x8
+        mov w12, w6 // y = block_h
+        b 4f
 
-        ldp gh0, gh1, [gh]
-        ldp gv0, gv1, [gv]
-        movi sgx2_v.4s, #0
-        movi sgy2_v.4s, #0
-        movi sgxgy_v.4s, #0
-        movi sgxdi_v.4s, #0
-        movi sgydi_v.4s, #0
-        movi sgx2_v2.4s, #0
-        movi sgy2_v2.4s, #0
-        movi sgxgy_v2.4s, #0
-        movi sgxdi_v2.4s, #0
-        movi sgydi_v2.4s, #0
-        mov x13, #-1 // dy
-        movi v6.4s, #0
-        mov y, #-1
-        tbz pad_mask, #1, 1f // check pad top
-        mov x13, #0 // dy: pad top
 1:
-        mov x16, #-2 // dx
-        add x14, src0, x13, lsl #8 // local src0
-        add x15, src1, x13, lsl #8 // local src1
-        add x17, x16, x13, lsl #5
-        ldr q0, [x14, x16]
-        ldr q1, [x15, x16]
-        ldr q2, [gh0, x17]
-        ldr q3, [gh1, x17]
-        ldr q4, [gv0, x17]
-        ldr q5, [gv1, x17]
-        add x16, x16, #8
-        add x17, x17, #8
-        ins v0.s[3], v6.s[3]
-        ins v1.s[3], v6.s[3]
-        ins v2.s[3], v6.s[3]
-        ins v3.s[3], v6.s[3]
-        ins v4.s[3], v6.s[3]
-        ins v5.s[3], v6.s[3]
+        // save line4 results
+        bdof_vx_vy_8x_save_line v5, v6, v7, v16, v17
+2:
+        addp v25.4s, v25.4s, v25.4s
+        addp v27.4s, v27.4s, v27.4s
+        addp v26.4s, v26.4s, v26.4s
+        addp v23.4s, v23.4s, v23.4s
+        addp v24.4s, v24.4s, v24.4s
 
-        ldr q16, [x14, x16]
-        ldr q17, [x15, x16]
-        ldr q18, [gh0, x17]
-        ldr q19, [gh1, x17]
-        ldr q20, [gv0, x17]
-        ldr q21, [gv1, x17]
-        ins v16.s[3], v6.s[3]
-        ins v17.s[3], v6.s[3]
-        ins v18.s[3], v6.s[3]
-        ins v19.s[3], v6.s[3]
-        ins v20.s[3], v6.s[3]
-        ins v21.s[3], v6.s[3]
+        clz v28.4s, v25.4s
+        add v28.4s, v28.4s, v2.4s // log2
+        shl v26.4s, v26.4s, #0x2
+        sshl v26.4s, v26.4s, v28.4s
 
-        tbz pad_mask, #0, 20f
-        // pad left
-        ins v0.h[0], v0.h[1]
-        ins v1.h[0], v1.h[1]
-        ins v2.h[0], v2.h[1]
-        ins v3.h[0], v3.h[1]
-        ins v4.h[0], v4.h[1]
-        ins v5.h[0], v5.h[1]
-20:
-        tbz pad_mask, #2, 21f
-        // pad right
-        ins v16.h[5], v16.h[4]
-        ins v17.h[5], v17.h[4]
-        ins v18.h[5], v18.h[4]
-        ins v19.h[5], v19.h[4]
-        ins v20.h[5], v20.h[4]
-        ins v21.h[5], v21.h[4]
-21:
-        sshr v0.8h, v0.8h, #4
-        sshr v1.8h, v1.8h, #4
-        add v2.8h, v2.8h, v3.8h
-        add v4.8h, v4.8h, v5.8h
-        sub v0.8h, v0.8h, v1.8h // diff
-        sshr v2.8h, v2.8h, #1 // temph
-        sshr v4.8h, v4.8h, #1 // tempv
+        bdof_vx_vy_clip_mask v26, v3, v4, v25, v25
+        sqxtn v26.4h, v25.4s
+        st1 {v26.s}[0], [x4], x11
 
-        sshr v16.8h, v16.8h, #4
-        sshr v17.8h, v17.8h, #4
-        add v18.8h, v18.8h, v19.8h
-        add v20.8h, v20.8h, v21.8h
-        sub v16.8h, v16.8h, v17.8h // diff
-        sshr v18.8h, v18.8h, #1 // temph
-        sshr v20.8h, v20.8h, #1 // tempv
+        subs x12, x12, #(BDOF_MIN_BLOCK_SIZE)
 
-        abs v3.8h, v2.8h
-        abs v5.8h, v4.8h
-        uxtl v19.4s, v3.4h
-        uxtl v21.4s, v5.4h
-        uxtl2 v3.4s, v3.8h
-        uxtl2 v5.4s, v5.8h
-        add v3.4s, v3.4s, v19.4s
-        add v5.4s, v5.4s, v21.4s
-        add sgx2_v.4s, sgx2_v.4s, v3.4s
-        add sgy2_v.4s, sgy2_v.4s, v5.4s
+        clz v26.4s, v27.4s
+        add v26.4s, v26.4s, v2.4s
+        shl v24.4s, v24.4s, #0x2
+        mul v23.4s, v25.4s, v23.4s
+        sshr v23.4s, v23.4s, #0x1
+        sub v23.4s, v24.4s, v23.4s
+        sshl v23.4s, v23.4s, v26.4s
 
-        abs v3.8h, v18.8h
-        abs v5.8h, v20.8h
-        uxtl v19.4s, v3.4h
-        uxtl v21.4s, v5.4h
-        uxtl2 v3.4s, v3.8h
-        uxtl2 v5.4s, v5.8h
-        add v3.4s, v3.4s, v19.4s
-        add v5.4s, v5.4s, v21.4s
-        add sgx2_v2.4s, sgx2_v2.4s, v3.4s
-        add sgy2_v2.4s, sgy2_v2.4s, v5.4s
+        bdof_vx_vy_clip_mask v23, v3, v4, v27, v23
+        sqxtn v23.4h, v23.4s
+        st1 {v23.s}[0], [x5], x11
 
-        cmgt v17.8h, v4.8h, #0
-        cmlt v7.8h, v4.8h, #0
-        cmgt v19.8h, v20.8h, #0
-        cmlt v21.8h, v20.8h, #0
-        sub v17.8h, v7.8h, v17.8h // VVC_SIGN(tempv)
-        sub v19.8h, v21.8h, v19.8h // VVC_SIGN(tempv)
-
-        smlal sgxgy_v.4s, v17.4h, v2.4h
-        smlal2 sgxgy_v.4s, v17.8h, v2.8h
-        smlsl sgydi_v.4s, v17.4h, v0.4h
-        smlsl2 sgydi_v.4s, v17.8h, v0.8h
-
-        cmgt v3.8h, v2.8h, #0
-        cmlt v5.8h, v2.8h, #0
-        cmgt v17.8h, v18.8h, #0
-        cmlt v21.8h, v18.8h, #0
-        sub v3.8h, v5.8h, v3.8h // VVC_SIGN(temph)
-        sub v17.8h, v21.8h, v17.8h // VVC_SIGN(temph)
-
-        smlal sgxgy_v2.4s, v19.4h, v18.4h
-        smlal2 sgxgy_v2.4s, v19.8h, v18.8h
-        smlsl sgydi_v2.4s, v19.4h, v16.4h
-        smlsl2 sgydi_v2.4s, v19.8h, v16.8h
-
-        smlsl sgxdi_v.4s, v3.4h, v0.4h
-        smlsl2 sgxdi_v.4s, v3.8h, v0.8h
-        smlsl sgxdi_v2.4s, v17.4h, v16.4h
-        smlsl2 sgxdi_v2.4s, v17.8h, v16.8h
-3:
-        add y, y, #1
-        cmp y, #(BDOF_MIN_BLOCK_SIZE)
-        mov x13, y
-        b.gt 4f
-        b.lt 1b
-        tbz pad_mask, #3, 1b
-        sub x13, x13, #1 // pad bottom
-        b 1b
+        b.eq 16f
 4:
-        addv s22, sgx2_v.4s
-        addv s23, sgy2_v.4s
-        addv s24, sgxgy_v.4s
-        addv s25, sgxdi_v.4s
-        addv s26, sgydi_v.4s
+        mov x15, #0x0 // dy, inner loop
 
-        mov w3, #31
-        mov w16, #-15
-        mov w17, #15
-40:
-        mov w14, #0
+        movi v25.2d, #0
+        movi v27.2d, #0
+        movi v23.2d, #0
+        movi v26.2d, #0
+        movi v24.2d, #0
+        b 8f
 
-        mov sgx2, v22.s[0]
-        mov sgy2, v23.s[0]
-        mov sgxgy, v24.s[0]
-        mov sgxdi, v25.s[0]
-        mov sgydi, v26.s[0]
-
-        cbz sgx2, 5f
-        clz w12, sgx2
-        lsl sgxdi, sgxdi, #2
-        sub w13, w3, w12 // log2(sgx2)
-        asr sgxdi, sgxdi, w13
-        cmp sgxdi, w16
-        csel w14, w16, sgxdi, lt // clip to -15
-        b.le 5f
-        cmp sgxdi, w17
-        csel w14, w17, sgxdi, gt // clip to 15
5:
-        strh w14, [vx], #2
+        // add line(-1) and line0 from previous results
+        bdof_vx_vy_8x_add_line v18, v19, v20, v21, v22
+        bdof_vx_vy_8x_add_line v5, v6, v7, v16, v17
+        add x15, x15, #1
+8:
+        cmp w12, w6
+        b.hs 9f
+        // y < block_h && dy == 0, reuse previous results
+        cbz x15, 5b
+9:
+        ldr q28, [x0] // src0
+        ldr q29, [x1] // src1
+        ldr q30, [x14], #(BDOF_BLOCK_SIZE * 2) // gh0
+        ldr q31, [x13], #(BDOF_BLOCK_SIZE * 2) // gh1
+        ldr q8, [x10], #(BDOF_BLOCK_SIZE * 2) // gv0
+        ldr q9, [x9], #(BDOF_BLOCK_SIZE * 2) // gv1
+        add x0, x0, #(VVC_MAX_PB_SIZE * 2)
+        add x1, x1, #(VVC_MAX_PB_SIZE * 2)
 
-        mov w15, #0
-        cbz sgy2, 6f
-        lsl sgydi, sgydi, #2
-        smull x14, w14, sgxgy
-        asr w14, w14, #1
-        sub sgydi, sgydi, w14
-        clz w12, sgy2
-        sub w13, w3, w12 // log2(sgy2)
-        asr sgydi, sgydi, w13
-        cmp sgydi, w16
-        csel w15, w16, sgydi, lt // clip to -15
-        b.le 6f
-        cmp sgydi, w17
-        csel w15, w17, sgydi, gt // clip to 15
-6:
-        strh w15, [vy], #2
-        cbz x0, 7f
-        addv s22, sgx2_v2.4s
-        addv s23, sgy2_v2.4s
-        addv s24, sgxgy_v2.4s
-        addv s25, sgxdi_v2.4s
-        addv s26, sgydi_v2.4s
-        mov x0, #0
-        b 40b
-7:
+        sshr v28.8h, v28.8h, #0x4
+        sshr v29.8h, v29.8h, #0x4
+        shadd v30.8h, v30.8h, v31.8h // tmph
+        shadd v31.8h, v8.8h, v9.8h // tmpv
+        sub v8.8h, v28.8h, v29.8h // diff
+
+        abs v28.8h, v30.8h
+        abs v29.8h, v31.8h
+
+        bdof_vx_vy_8x_padding_left_right v28, v9, v10, v28
+        bdof_vx_vy_8x_padding_left_right v29, v9, v10, v29
+
+        bdof_vx_vy_sign v30, v9, v10, v9
+        bdof_vx_vy_sign v31, v10, v31, v31
+
+        mul v30.8h, v31.8h, v30.8h
+        mul v9.8h, v9.8h, v8.8h
+        mul v8.8h, v31.8h, v8.8h
+
+        bdof_vx_vy_8x_padding_left_right v30, v31, v10, v30
+        bdof_vx_vy_8x_padding_left_right v9, v31, v10, v31
+        bdof_vx_vy_8x_padding_left_right v8, v9, v10, v8
+
+        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+
+        cmp w12, w6
+        b.ne 10f
+        cbnz x15, 10f
+
+        // y == block_h && dy == 0, duplicate first line results
+        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+        add x15, x15, #0x1
+        b 9b
+10:
+        cmp x15, #(BDOF_MIN_BLOCK_SIZE - 1)
+        b.eq 11f
+        cmp x15, #(BDOF_MIN_BLOCK_SIZE)
+        b.ne 12f
+        b 1b
+11:
+        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
+        // duplicate the results and break
+        cmp x12, #(BDOF_MIN_BLOCK_SIZE)
+        b.eq 13f
+        bdof_vx_vy_8x_save_line v18, v19, v20, v21, v22
+12:
+        add x15, x15, #1
+        b 8b
+13:
+        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
+        // padding bottom then break
+        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
+        b 2b
+16:
+        ldp d9, d8, [sp, #0x10]
+        ldp d11, d10, [sp], #0x20
+        ret
+endfunc
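+
+// 16-wide variant: same row-reuse scheme as above, but each iteration
+// covers a full 16-column stripe, using the two-register tbl form with
+// bdof_vx_vy_16x_tbl to pad and reduce both halves at once.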
+
+/*
+ * x0: const int16_t *_src0,
+ * x1: const int16_t *_src1,
+ * x2: int16_t *gradient_h[2],
+ * x3: int16_t *gradient_v[2],
+ * x4: int16_t vx[16],
+ * x5: int16_t vy[16],
+ * w6: int block_h
+ */
+function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
+        sub sp, sp, #0x80
+        stp d15, d14, [sp, #0x30]
+        stp d13, d12, [sp, #0x40]
+        stp d11, d10, [sp, #0x50]
+        stp d9, d8, [sp, #0x60]
+        stp x29, x30, [sp, #0x70]
+
+        ldp x8, x9, [x2] // gh0, gh1
+        ldp x10, x11, [x3] // gv0, gv1
+        movrel x12, bdof_vx_vy_16x_tbl
+        ldp q0, q1, [x12] // table
+        mov w13, w6 // y = block_h
+        b 4f
+
+1:
+        // save line4
+        bdof_vx_vy_16x_save_line v6, v7, v16, v17, v18
+2:
+        clz v3.4s, v25.4s
+        mvni v5.4s, #0x1e
+        add v3.4s, v3.4s, v5.4s // -log2()
+        shl v4.4s, v28.4s, #0x2
+        sshl v3.4s, v4.4s, v3.4s
+
+        movi v28.4s, #0xf // clip to 15
+        mvni v29.4s, #0xe // clip to -15
+        bdof_vx_vy_clip_mask v3, v28, v29, v25, v3
+        sqxtn v4.4h, v3.4s
+        st1 {v4.d}[0], [x4], #(BDOF_MIN_BLOCK_SIZE * 2)
+
+        subs x13, x13, #(BDOF_MIN_BLOCK_SIZE) // y -= BDOF_MIN_BLOCK_SIZE
+
+        clz v4.4s, v24.4s
+        add v4.4s, v4.4s, v5.4s // -log2()
+        shl v5.4s, v27.4s, #0x2
+        mul v3.4s, v3.4s, v26.4s
+        sshr v3.4s, v3.4s, #0x1
+        sub v3.4s, v5.4s, v3.4s
+        sshl v3.4s, v3.4s, v4.4s
+
+        bdof_vx_vy_clip_mask v3, v28, v29, v24, v3
+        sqxtn v3.4h, v3.4s
+        st1 {v3.d}[0], [x5], #(BDOF_MIN_BLOCK_SIZE * 2)
+        b.eq 16f
+4:
+        mov w14, #0x0 // dy, inner loop
+
+        movi v25.2d, #0
+        movi v24.2d, #0
+        movi v26.2d, #0
+        movi v28.2d, #0
+        movi v27.2d, #0
+        b 8f
+
+5:
+        // add line(-1) and line0 from previous results
+        bdof_vx_vy_16x_add_line v19, v20, v21, v22, v23
+        bdof_vx_vy_16x_add_line v6, v7, v16, v17, v18
+        add w14, w14, #0x1
+
+8:
+        cmp w13, w6
+        b.hs 9f
+        // y < block_h && dy == 0, reuse previous results
+        cbz w14, 5b
+9:
+        ld1 {v29.8h, v30.8h}, [x0] // src0
+        sshr v31.8h, v29.8h, #0x4
+        ld1 {v8.8h, v9.8h}, [x1] // src1
+        sshr v10.8h, v8.8h, #0x4
+        ld1 {v11.8h, v12.8h}, [x8], #32 // gh0
+        sshr v29.8h, v30.8h, #0x4
+        sshr v30.8h, v9.8h, #0x4
+        ld1 {v8.8h, v9.8h}, [x9], #32 // gh1
+        shadd v13.8h, v11.8h, v8.8h // (gh0 + gh1) >> 1, left half
+        ld1 {v14.8h, v15.8h}, [x10], #32 // gv0
+        ld1 {v3.8h, v4.8h}, [x11], #32 // gv1
+        shadd v5.8h, v14.8h, v3.8h // (gv0 + gv1) >> 1, left half
+        sub v31.8h, v31.8h, v10.8h // diff, left half
+        shadd v8.8h, v12.8h, v9.8h // (gh0 + gh1) >> 1, right half
+        shadd v3.8h, v15.8h, v4.8h // (gv0 + gv1) >> 1, right half
+        sub v4.8h, v29.8h, v30.8h // diff, right half
+
+        abs v29.8h, v13.8h
+        abs v30.8h, v8.8h
+        abs v9.8h, v5.8h
+        abs v10.8h, v3.8h
+
+        add x0, x0, #(VVC_MAX_PB_SIZE * 2)
+        add x1, x1, #(VVC_MAX_PB_SIZE * 2)
+
+        bdof_vx_vy_16x_padding_left_right v29, v30, v11, v12, v14, v29
+        bdof_vx_vy_16x_padding_left_right v9, v10, v11, v12, v14, v30
+
+        bdof_vx_vy_sign v13, v9, v10, v9
+        bdof_vx_vy_sign v8, v10, v11, v10
+        bdof_vx_vy_sign v5, v11, v5, v5
+        bdof_vx_vy_sign v3, v11, v3, v3
+
+        mul v11.8h, v5.8h, v13.8h
+        mul v12.8h, v3.8h, v8.8h
+        mul v8.8h, v9.8h, v31.8h
+        mul v9.8h, v10.8h, v4.8h
+        mul v13.8h, v5.8h, v31.8h
+        mul v14.8h, v3.8h, v4.8h
+
+        bdof_vx_vy_16x_padding_left_right v11, v12, v3, v4, v5, v31
+        bdof_vx_vy_16x_padding_left_right v8, v9, v3, v4, v5, v8
+        bdof_vx_vy_16x_padding_left_right v13, v14, v3, v4, v5, v9
+
+        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+        // check whether padding top
+        cmp w13, w6
+        b.ne 10f
+        cbnz w14, 10f
+        // y == block_h && dy == 0, padding top
+        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+        add w14, w14, #0x1
+        b 9b
+10:
+        cmp w14, #(BDOF_MIN_BLOCK_SIZE - 1)
+        b.eq 11f
+        cmp w14, #(BDOF_MIN_BLOCK_SIZE)
+        b.ne 12f
+        // save line4
+        b 1b
+11:
+        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1, padding bottom
+        cmp x13, #(BDOF_MIN_BLOCK_SIZE)
+        b.eq 13f
+        // save line3
+        bdof_vx_vy_16x_save_line v19, v20, v21, v22, v23
+12:
+        add w14, w14, #0x1 // dy++
+        b 8b
+13:
+        // padding bottom
+        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
+        b 2b
+16:
+        // restore
+        ldp x29, x30, [sp, #0x70]
+        ldp d9, d8, [sp, #0x60]
+        ldp d11, d10, [sp, #0x50]
+        ldp d13, d12, [sp, #0x40]
+        ldp d15, d14, [sp, #0x30]
+        add sp, sp, #0x80
         ret
-
-.unreq src0
-.unreq src1
-.unreq pad_mask
-.unreq gh
-.unreq gv
-.unreq vx
-.unreq vy
-.unreq sgx2
-.unreq sgy2
-.unreq sgxgy
-.unreq sgxdi
-.unreq sgydi
-.unreq sgx2_v
-.unreq sgy2_v
-.unreq sgxgy_v
-.unreq sgxdi_v
-.unreq sgydi_v
-.unreq sgx2_v2
-.unreq sgy2_v2
-.unreq sgxgy_v2
-.unreq sgxdi_v2
-.unreq sgydi_v2
-.unreq y
 endfunc
diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
index ac6182b09d..d8ddaacb14 100644
--- a/libavcodec/aarch64/vvc/of_template.c
+++ b/libavcodec/aarch64/vvc/of_template.c
@@ -41,6 +41,11 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
     ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
                                     BDOF_BLOCK_SIZE,
                                     _src1, MAX_PB_SIZE, block_w, block_h);
+    int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
+    if (block_w == 8)
+        ff_vvc_derive_bdof_vx_vy_8x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
+    else
+        ff_vvc_derive_bdof_vx_vy_16x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
 
     for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
         for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
@@ -50,14 +55,10 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
             int idx = BDOF_BLOCK_SIZE * y + x;
             const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
             const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
-            int16_t vx[2], vy[2];
-            int pad_mask = !x | ((!y) << 1) |
-                           ((x + 2 * BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
-                           ((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
-            ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, vx, vy);
+            int idx1 = y + x / BDOF_MIN_BLOCK_SIZE;
             FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
-                                                             src0, src1, gh, gv,
-                                                             vx, vy);
+                                                             src0, src1, gh, gv,
+                                                             vx + idx1, vy + idx1);
         }
         dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
     }
-- 
2.49.1

>From 6edb238e7ac1f98c7b16b0db05e0f5436152cc48 Mon Sep 17 00:00:00 2001
From: Zhao Zhili
Date: Thu, 14 Aug 2025 12:42:38 +0800
Subject: [PATCH 2/2] avcodec/aarch64/vvc: Optimize apply_bdof

                                     Before             After
--------------------------------------------------------------------
apply_bdof_8_8x16_c:      |  7431.4 ( 1.00x) |  7371.7 ( 1.00x)
apply_bdof_8_8x16_neon:   |  1175.4 ( 6.32x) |  1036.3 ( 7.11x)
apply_bdof_8_16x8_c:      |  7182.2 ( 1.00x) |  7201.1 ( 1.00x)
apply_bdof_8_16x8_neon:   |  1021.7 ( 7.03x) |   879.9 ( 8.18x)
apply_bdof_8_16x16_c:     | 14577.1 ( 1.00x) | 14589.3 ( 1.00x)
apply_bdof_8_16x16_neon:  |  2012.8 ( 7.24x) |  1743.3 ( 8.37x)
apply_bdof_10_8x16_c:     |  7292.4 ( 1.00x) |  7308.5 ( 1.00x)
apply_bdof_10_8x16_neon:  |  1156.3 ( 6.31x) |  1045.3 ( 6.99x)
apply_bdof_10_16x8_c:     |  7112.4 ( 1.00x) |  7214.4 ( 1.00x)
apply_bdof_10_16x8_neon:  |  1007.6 ( 7.06x) |   904.8 ( 7.97x)
apply_bdof_10_16x16_c:    | 14363.3 ( 1.00x) | 14476.4 ( 1.00x)
apply_bdof_10_16x16_neon: |  1986.9 ( 7.23x) |  1783.1 ( 8.12x)
apply_bdof_12_8x16_c:     |  7433.3 ( 1.00x) |  7374.7 ( 1.00x)
apply_bdof_12_8x16_neon:  |  1155.9 ( 6.43x) |  1040.8 ( 7.09x)
apply_bdof_12_16x8_c:     |  7171.1 ( 1.00x) |  7376.3 ( 1.00x)
apply_bdof_12_16x8_neon:  |  1010.8 ( 7.09x) |   899.4 ( 8.20x)
apply_bdof_12_16x16_c:    | 14515.5 ( 1.00x) | 14731.5 ( 1.00x)
apply_bdof_12_16x16_neon: |  1988.4 ( 7.30x) |  1785.2 ( 8.25x)
---
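Note: the deleted C wrapper's scratch buffers now live in the asm wrapper's
stack frame. Illustratively (a C sketch of the layout defined by the
*_OFFSET macros in the diff; this struct does not exist in the code):

    // Conceptual frame of ff_vvc_apply_bdof_{8,10,12}_neon. The fused
    // gradient filter writes (g0 + g1) >> 1 into plane 0 and g0 - g1 into
    // plane 1; the "+ 2" leaves one spare int16_t column per plane edge.
    struct apply_bdof_frame {
        int16_t gradient_buf_h[2][16 * 16 + 2];   // BDOF_BLOCK_SIZE == 16
        int16_t gradient_buf_v[2][16 * 16 + 2];
        int16_t vx[16], vy[16];
    };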
 libavcodec/aarch64/vvc/dsp_init.c    |  37 +--
 libavcodec/aarch64/vvc/inter.S       | 402 ++++++++++++++++++++++++---
 libavcodec/aarch64/vvc/of_template.c |  65 -----
 3 files changed, 368 insertions(+), 136 deletions(-)
 delete mode 100644 libavcodec/aarch64/vvc/of_template.c

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 1db38ebb1d..df0b536539 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -30,38 +30,16 @@
 #define BDOF_BLOCK_SIZE 16
 #define BDOF_MIN_BLOCK_SIZE 4
 
-void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
-                                     int16_t *gradient_v,
-                                     ptrdiff_t gradient_stride,
-                                     const int16_t *_src,
-                                     ptrdiff_t src_stride,
-                                     int width, int height);
-
-void ff_vvc_derive_bdof_vx_vy_8x_neon(const int16_t *_src0,
-                                      const int16_t *_src1,
-                                      int16_t *const gradient_h[2],
-                                      int16_t *const gradient_v[2],
-                                      int16_t vx[16], int16_t vy[16],
-                                      int block_h);
-void ff_vvc_derive_bdof_vx_vy_16x_neon(const int16_t *_src0,
-                                       const int16_t *_src1,
-                                       int16_t *const gradient_h[2],
-                                       int16_t *const gradient_v[2],
-                                       int16_t vx[16], int16_t vy[16],
-                                       int block_h);
 #define BIT_DEPTH 8
 #include "alf_template.c"
-#include "of_template.c"
 #undef BIT_DEPTH
 
 #define BIT_DEPTH 10
 #include "alf_template.c"
-#include "of_template.c"
 #undef BIT_DEPTH
 
 #define BIT_DEPTH 12
 #include "alf_template.c"
-#include "of_template.c"
 #undef BIT_DEPTH
 
 int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
@@ -121,6 +99,15 @@ DMVR_FUN(hv_, 8)
 DMVR_FUN(hv_, 10)
 DMVR_FUN(hv_, 12)
 
+#define APPLY_BDOF_FUNC(bd) \
+    void ff_vvc_apply_bdof_ ## bd ## _neon(uint8_t *_dst, ptrdiff_t _dst_stride, \
+                                           const int16_t *_src0, const int16_t *_src1, \
+                                           int block_w, int block_h);
+
+APPLY_BDOF_FUNC(8)
+APPLY_BDOF_FUNC(10)
+APPLY_BDOF_FUNC(12)
+
 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -202,7 +189,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.w_avg = vvc_w_avg_8;
         c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
-        c->inter.apply_bdof = apply_bdof_8;
+        c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon;
 
         c->sao.band_filter[0] = ff_h26x_sao_band_filter_8x8_8_neon;
         for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
@@ -246,7 +233,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.avg = ff_vvc_avg_10_neon;
         c->inter.w_avg = vvc_w_avg_10;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
-        c->inter.apply_bdof = apply_bdof_10;
+        c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
 
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
@@ -255,7 +242,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.w_avg = vvc_w_avg_12;
         c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
-        c->inter.apply_bdof = apply_bdof_12;
+        c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
 
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 06c6f3619b..61de56c6ac 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -716,7 +716,93 @@ function ff_vvc_prof_grad_filter_8x_neon, export=1
 .unreq height
 endfunc
 
-.macro vvc_apply_bdof_block bit_depth
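+// Fused gradient pass: unlike ff_vvc_prof_grad_filter_8x_neon, this walks
+// src0 and src1 together and directly stores the combined terms BDOF
+// consumes: (grad0 + grad1) >> 1 into the first plane and grad0 - grad1
+// into the second, saving a second pass over the gradient data.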
+function ff_vvc_bdof_grad_filter_8x_neon, export=1
+        gh0 .req x0
+        gh1 .req x1
+        gv0 .req x2
+        gv1 .req x3
+        src0 .req x4
+        src1 .req x5
+        width .req w6
+        height .req w7
+
+1:
+        mov x10, src0
+        mov w11, width
+        mov x12, gh0
+        mov x13, gv0
+        mov x14, src1
+        mov x15, gh1
+        mov x16, gv1
+2:
+        ldur q0, [x10, #2]
+        ldur q1, [x10, #-2]
+        ldr q2, [x10, #(VVC_MAX_PB_SIZE << 1)]
+        ldr q3, [x10, #-(VVC_MAX_PB_SIZE << 1)]
+        sshr v0.8h, v0.8h, #6
+        sshr v1.8h, v1.8h, #6
+        ldur q4, [x14, #2]
+        ldur q5, [x14, #-2]
+        sshr v2.8h, v2.8h, #6
+        sshr v3.8h, v3.8h, #6
+        ldr q6, [x14, #(VVC_MAX_PB_SIZE << 1)]
+        ldr q7, [x14, #-(VVC_MAX_PB_SIZE << 1)]
+        // results of gradient_h0
+        sub v0.8h, v0.8h, v1.8h
+        // results of gradient_v0
+        sub v2.8h, v2.8h, v3.8h
+
+        sshr v4.8h, v4.8h, #6
+        sshr v5.8h, v5.8h, #6
+        sshr v6.8h, v6.8h, #6
+        sshr v7.8h, v7.8h, #6
+        // results of gradient_h1
+        sub v4.8h, v4.8h, v5.8h
+        // results of gradient_v1
+        sub v6.8h, v6.8h, v7.8h
+
+        add x10, x10, #16
+        add x14, x14, #16
+
+        // (gradient_h0 + gradient_h1) >> 1
+        shadd v1.8h, v0.8h, v4.8h
+        // gradient_h0 - gradient_h1
+        sub v5.8h, v0.8h, v4.8h
+
+        subs w11, w11, #8
+
+        // (gradient_v0 + gradient_v1) >> 1
+        shadd v3.8h, v2.8h, v6.8h
+        // gradient_v0 - gradient_v1
+        sub v7.8h, v2.8h, v6.8h
+
+        st1 {v1.8h}, [x12], #16
+        st1 {v5.8h}, [x15], #16
+        st1 {v3.8h}, [x13], #16
+        st1 {v7.8h}, [x16], #16
+        b.ne 2b
+
+        subs height, height, #1
+        add gh0, gh0, #(BDOF_BLOCK_SIZE << 1)
+        add gv0, gv0, #(BDOF_BLOCK_SIZE << 1)
+        add src0, src0, #(VVC_MAX_PB_SIZE << 1)
+        add gh1, gh1, #(BDOF_BLOCK_SIZE << 1)
+        add gv1, gv1, #(BDOF_BLOCK_SIZE << 1)
+        add src1, src1, #(VVC_MAX_PB_SIZE << 1)
+        b.ne 1b
+        ret
+
+.unreq gh0
+.unreq gh1
+.unreq gv0
+.unreq gv1
+.unreq src0
+.unreq src1
+.unreq width
+.unreq height
+endfunc
+
+.macro vvc_apply_bdof_block_8x bit_depth
     dst .req x0
     dst_stride .req x1
     src0 .req x2
     src1 .req x3
     gh .req x4
     gv .req x5
     vx .req x6
     vy .req x7
 
-        ld1r {v0.8h}, [vx], #2
-        ld1r {v1.8h}, [vy], #2
-        ld1r {v2.8h}, [vx]
-        ld1r {v3.8h}, [vy]
-        ins v0.d[1], v2.d[1]
-        ins v1.d[1], v3.d[1]
-
+        ldr w8, [sp]
         movi v7.4s, #(1 << (14 - \bit_depth))
-        ldp x8, x9, [gh]
-        ldp x10, x11, [gv]
         mov x12, #(BDOF_BLOCK_SIZE * 2)
-        mov w13, #(BDOF_MIN_BLOCK_SIZE)
         mov x14, #(VVC_MAX_PB_SIZE * 2)
 .if \bit_depth >= 10
         // clip pixel
         mov w15, #((1 << \bit_depth) - 1)
         movi v18.8h, #0
-        lsl dst_stride, dst_stride, #1
         dup v19.8h, w15
 .endif
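+
+        // vx/vy now come from the precomputed per-stripe arrays: each
+        // outer iteration broadcasts the two sub-block (vx, vy) pairs of
+        // the current 4-row stripe, processes BDOF_MIN_BLOCK_SIZE rows,
+        // then steps the arrays; w8 holds block_h passed on the stack.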
+0:
+        ld1r {v0.8h}, [vx], #2
+        ld1r {v1.8h}, [vy], #2
+        ld1r {v2.8h}, [vx]
+        ld1r {v3.8h}, [vy]
+        mov w13, #(BDOF_MIN_BLOCK_SIZE)
+        ins v0.d[1], v2.d[1]
+        ins v1.d[1], v3.d[1]
 1:
-        ld1 {v2.8h}, [x8], x12
-        ld1 {v3.8h}, [x9], x12
-        ld1 {v4.8h}, [x10], x12
-        ld1 {v5.8h}, [x11], x12
-        sub v2.8h, v2.8h, v3.8h
-        sub v4.8h, v4.8h, v5.8h
+        ld1 {v2.8h}, [gh], x12
+        ld1 {v4.8h}, [gv], x12
         smull v3.4s, v0.4h, v2.4h
         smull2 v16.4s, v0.8h, v2.8h
         smlal v3.4s, v1.4h, v4.4h
         smlal2 v16.4s, v1.8h, v4.8h
@@ -780,6 +861,11 @@ endfunc
         st1 {v5.8h}, [dst], dst_stride
 .endif
         b.ne 1b
+
+        subs w8, w8, #(BDOF_MIN_BLOCK_SIZE)
+        add vx, vx, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
+        add vy, vy, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
+        b.ne 0b
         ret
 
 .unreq dst
 .unreq dst_stride
 .unreq src0
 .unreq src1
 .unreq gh
 .unreq gv
 .unreq vx
 .unreq vy
 .endm
 
-function ff_vvc_apply_bdof_block_8_neon, export=1
-        vvc_apply_bdof_block 8
+function ff_vvc_apply_bdof_block_8x_8_neon, export=1
+        vvc_apply_bdof_block_8x 8
 endfunc
 
-function ff_vvc_apply_bdof_block_10_neon, export=1
-        vvc_apply_bdof_block 10
+function ff_vvc_apply_bdof_block_8x_10_neon, export=1
+        vvc_apply_bdof_block_8x 10
 endfunc
 
-function ff_vvc_apply_bdof_block_12_neon, export=1
-        vvc_apply_bdof_block 12
+function ff_vvc_apply_bdof_block_8x_12_neon, export=1
+        vvc_apply_bdof_block_8x 12
+endfunc
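+
+// 16-wide apply: one outer iteration broadcasts four (vx, vy) pairs and
+// writes a whole 16-column stripe per row via ldp/stp, so 16xH blocks
+// need no horizontal tiling.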
+.macro vvc_apply_bdof_block_16x bit_depth
+    dst .req x0
+    dst_stride .req x1
+    src0 .req x2
+    src1 .req x3
+    gh .req x4
+    gv .req x5
+    vx .req x6
+    vy .req x7
+
+        ldr w8, [sp]
+        movi v7.4s, #(1 << (14 - \bit_depth))
+.if \bit_depth >= 10
+        // clip pixel
+        mov w15, #((1 << \bit_depth) - 1)
+        movi v18.8h, #0
+        dup v19.8h, w15
+.endif
+
+0:
+        ld1r {v0.8h}, [vx], #2
+        ld1r {v1.8h}, [vy], #2
+        ld1r {v2.8h}, [vx], #2
+        ld1r {v3.8h}, [vy], #2
+
+        mov w13, #(BDOF_MIN_BLOCK_SIZE)
+
+        ld1r {v20.8h}, [vx], #2
+        ld1r {v21.8h}, [vy], #2
+        ld1r {v22.8h}, [vx], #2
+        ld1r {v23.8h}, [vy], #2
+
+        ins v0.d[1], v2.d[1]
+        ins v1.d[1], v3.d[1]
+        ins v20.d[1], v22.d[1]
+        ins v21.d[1], v23.d[1]
+1:
+        ldp q2, q22, [gh], #(BDOF_BLOCK_SIZE * 2)
+        ldp q4, q24, [gv], #(BDOF_BLOCK_SIZE * 2)
+        smull v3.4s, v0.4h, v2.4h
+        smull2 v16.4s, v0.8h, v2.8h
+        smlal v3.4s, v1.4h, v4.4h
+        smlal2 v16.4s, v1.8h, v4.8h
+
+        ldp q5, q25, [src0], #(VVC_MAX_PB_SIZE * 2)
+        ldp q6, q26, [src1], #(VVC_MAX_PB_SIZE * 2)
+
+        smull v23.4s, v20.4h, v22.4h
+        smull2 v27.4s, v20.8h, v22.8h
+        smlal v23.4s, v21.4h, v24.4h
+        smlal2 v27.4s, v21.8h, v24.8h
+
+        saddl v2.4s, v5.4h, v6.4h
+        add v2.4s, v2.4s, v7.4s
+        add v2.4s, v2.4s, v3.4s
+        saddl2 v4.4s, v5.8h, v6.8h
+        add v4.4s, v4.4s, v7.4s
+        add v4.4s, v4.4s, v16.4s
+
+        saddl v22.4s, v25.4h, v26.4h
+        add v22.4s, v22.4s, v7.4s
+        add v22.4s, v22.4s, v23.4s
+        saddl2 v24.4s, v25.8h, v26.8h
+        add v24.4s, v24.4s, v7.4s
+        add v24.4s, v24.4s, v27.4s
+
+        sqshrn v5.4h, v2.4s, #(15 - \bit_depth)
+        sqshrn2 v5.8h, v4.4s, #(15 - \bit_depth)
+        sqshrn v25.4h, v22.4s, #(15 - \bit_depth)
+        sqshrn2 v25.8h, v24.4s, #(15 - \bit_depth)
+
+        subs w13, w13, #1
+.if \bit_depth == 8
+        sqxtun v5.8b, v5.8h
+        sqxtun2 v5.16b, v25.8h
+        str q5, [dst]
+.else
+        smin v5.8h, v5.8h, v19.8h
+        smax v5.8h, v5.8h, v18.8h
+        smin v25.8h, v25.8h, v19.8h
+        smax v25.8h, v25.8h, v18.8h
+        stp q5, q25, [dst]
+.endif
+        add dst, dst, dst_stride
+        b.ne 1b
+
+        subs w8, w8, #(BDOF_MIN_BLOCK_SIZE)
+        b.ne 0b
+        ret
+
+.unreq dst
+.unreq dst_stride
+.unreq src0
+.unreq src1
+.unreq gh
+.unreq gv
+.unreq vx
+.unreq vy
+.endm
+
+function ff_vvc_apply_bdof_block_16x_8_neon, export=1
+        vvc_apply_bdof_block_16x 8
+endfunc
+
+function ff_vvc_apply_bdof_block_16x_10_neon, export=1
+        vvc_apply_bdof_block_16x 10
+endfunc
+
+function ff_vvc_apply_bdof_block_16x_12_neon, export=1
+        vvc_apply_bdof_block_16x 12
 endfunc
 
 const bdof_vx_vy_8x_tbl
@@ -885,8 +1083,8 @@ endconst
 /*
  * x0: const int16_t *_src0,
  * x1: const int16_t *_src1,
- * x2: int16_t *gradient_h[2],
- * x3: int16_t *gradient_v[2],
+ * x2: const int16_t *gradient_h,
+ * x3: const int16_t *gradient_v,
  * x4: int16_t vx[16],
  * x5: int16_t vy[16],
  * w6: int block_h
@@ -895,8 +1093,6 @@ function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
         stp d11, d10, [sp, #-0x20]!
         stp d9, d8, [sp, #0x10]
 
-        ldp x14, x13, [x2] // gh0, gh1
-        ldp x10, x9, [x3] // gv0, gv1
         movrel x11, bdof_vx_vy_8x_tbl
         ldr q0, [x11] // table
         mvni v2.4s, #30 // -31, for log2
@@ -964,17 +1160,13 @@ function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
 9:
         ldr q28, [x0] // src0
         ldr q29, [x1] // src1
-        ldr q30, [x14], #(BDOF_BLOCK_SIZE * 2) // gh0
-        ldr q31, [x13], #(BDOF_BLOCK_SIZE * 2) // gh1
-        ldr q8, [x10], #(BDOF_BLOCK_SIZE * 2) // gv0
-        ldr q9, [x9], #(BDOF_BLOCK_SIZE * 2) // gv1
+        ldr q30, [x2], #(BDOF_BLOCK_SIZE * 2) // (gh0 + gh1) >> 1
+        ldr q31, [x3], #(BDOF_BLOCK_SIZE * 2) // (gv0 + gv1) >> 1
         add x0, x0, #(VVC_MAX_PB_SIZE * 2)
         add x1, x1, #(VVC_MAX_PB_SIZE * 2)
 
         sshr v28.8h, v28.8h, #0x4
         sshr v29.8h, v29.8h, #0x4
-        shadd v30.8h, v30.8h, v31.8h // tmph
-        shadd v31.8h, v8.8h, v9.8h // tmpv
         sub v8.8h, v28.8h, v29.8h // diff
 
         abs v28.8h, v30.8h
@@ -1033,8 +1225,8 @@ endfunc
 /*
  * x0: const int16_t *_src0,
  * x1: const int16_t *_src1,
- * x2: int16_t *gradient_h[2],
- * x3: int16_t *gradient_v[2],
+ * x2: const int16_t *gradient_h,
+ * x3: const int16_t *gradient_v,
  * x4: int16_t vx[16],
  * x5: int16_t vy[16],
 * w6: int block_h
@@ -1047,8 +1239,6 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
         stp d9, d8, [sp, #0x60]
         stp x29, x30, [sp, #0x70]
 
-        ldp x8, x9, [x2] // gh0, gh1
-        ldp x10, x11, [x3] // gv0, gv1
         movrel x12, bdof_vx_vy_16x_tbl
         ldp q0, q1, [x12] // table
         mov w13, w6 // y = block_h
@@ -1110,17 +1300,11 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
         sshr v31.8h, v29.8h, #0x4
         ld1 {v8.8h, v9.8h}, [x1] // src1
         sshr v10.8h, v8.8h, #0x4
-        ld1 {v11.8h, v12.8h}, [x8], #32 // gh0
+        ldp q13, q8, [x2], #32 // (gh0 + gh1) >> 1
         sshr v29.8h, v30.8h, #0x4
         sshr v30.8h, v9.8h, #0x4
-        ld1 {v8.8h, v9.8h}, [x9], #32 // gh1
-        shadd v13.8h, v11.8h, v8.8h // (gh0 + gh1) >> 1, left half
-        ld1 {v14.8h, v15.8h}, [x10], #32 // gv0
-        ld1 {v3.8h, v4.8h}, [x11], #32 // gv1
-        shadd v5.8h, v14.8h, v3.8h // (gv0 + gv1) >> 1, left half
+        ldp q5, q3, [x3], #32 // (gv0 + gv1) >> 1
         sub v31.8h, v31.8h, v10.8h // diff, left half
-        shadd v8.8h, v12.8h, v9.8h // (gh0 + gh1) >> 1, right half
-        shadd v3.8h, v15.8h, v4.8h // (gv0 + gv1) >> 1, right half
         sub v4.8h, v29.8h, v30.8h // diff, right half
 
         abs v29.8h, v13.8h
@@ -1189,3 +1373,129 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
         add sp, sp, #0x80
         ret
 endfunc
+
+function ff_vvc_apply_bdof_10_neon, export=1
+        mov w6, #10
+        b 0f
+endfunc
+
+function ff_vvc_apply_bdof_12_neon, export=1
+        mov w6, #12
+        b 0f
+endfunc
+
+// int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
+// int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
+// int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
+#define APPLY_BDOF_STACK_SIZE ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 4)
+#define GRADIENT_H0_OFFSET 2
+#define GRADIENT_H1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 2 + 2)
+#define GRADIENT_V0_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 4 + 2)
+#define GRADIENT_V1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 6 + 2)
+#define VX_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8)
+#define VY_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 2)
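+
+// Shared wrapper body: ff_vvc_apply_bdof_{8,10,12}_neon set w6 to the bit
+// depth and branch to the common code at 0:. The gradient planes and
+// vx/vy live in the stack frame laid out above; the fused gradient filter
+// runs once, then block_w selects the 8x or 16x derive/apply pair, with
+// block_h passed on the stack to the apply macros.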
+function ff_vvc_apply_bdof_8_neon, export=1
+        mov w6, #8
+0:
+        stp x19, x20, [sp, #-0x40]!
+        stp x21, x22, [sp, #0x10]
+        stp x23, x24, [sp, #0x20]
+        stp x25, x30, [sp, #0x30]
+
+        sub sp, sp, #APPLY_BDOF_STACK_SIZE
+        mov w19, w6 // bit_depth
+        mov x20, x0 // dst
+        mov x21, x1 // dst_stride
+        mov x22, x2 // src0
+        mov x23, x3 // src1
+        mov w24, w4 // block_w
+        mov w25, w5 // block_h
+
+        // int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
+        add x0, sp, #GRADIENT_H0_OFFSET
+        add x1, sp, #GRADIENT_H1_OFFSET
+        add x2, sp, #GRADIENT_V0_OFFSET
+        add x3, sp, #GRADIENT_V1_OFFSET
+        mov x4, x22
+        mov x5, x23
+        mov w6, w24
+        mov w7, w25
+        bl X(ff_vvc_bdof_grad_filter_8x_neon)
+
+        cmp w24, #8
+        mov x0, x22 // src0
+        mov x1, x23 // src1
+        add x2, sp, #GRADIENT_H0_OFFSET // gh0
+        add x3, sp, #GRADIENT_V0_OFFSET // gv0
+        add x4, sp, #VX_OFFSET // vx
+        add x5, sp, #VY_OFFSET // vy
+        mov w6, w25 // block_h
+
+        b.gt 16f
+
+        bl X(ff_vvc_derive_bdof_vx_vy_8x_neon)
+        cmp w19, #10 // check bitdepth
+        mov x0, x20 // dst
+        mov x1, x21 // dst_stride
+        mov x2, x22 // src0
+        mov x3, x23 // src1
+        add x4, sp, #GRADIENT_H1_OFFSET // gh1
+        add x5, sp, #GRADIENT_V1_OFFSET // gv1
+        add x6, sp, #VX_OFFSET
+        add x7, sp, #VY_OFFSET
+        str w25, [sp]
+        b.eq 1f
+        b.gt 2f
+        // 8bit
+0:
+        bl X(ff_vvc_apply_bdof_block_8x_8_neon)
+        b 32f
+1:
+        // 10bit
+        bl X(ff_vvc_apply_bdof_block_8x_10_neon)
+        b 32f
+2:
+        // 12bit
+        bl X(ff_vvc_apply_bdof_block_8x_12_neon)
+        b 32f
+16:
+        bl X(ff_vvc_derive_bdof_vx_vy_16x_neon)
+
+        cmp w19, #10 // check bitdepth
+        mov x0, x20 // dst
+        mov x1, x21 // dst_stride
+        mov x2, x22 // src0
+        mov x3, x23 // src1
+        add x4, sp, #GRADIENT_H1_OFFSET // gh1
+        add x5, sp, #GRADIENT_V1_OFFSET // gv1
+        add x6, sp, #VX_OFFSET
+        add x7, sp, #VY_OFFSET
+        str w25, [sp]
+        b.eq 17f
+        b.gt 18f
+        // 8bit
+        bl X(ff_vvc_apply_bdof_block_16x_8_neon)
+        b 32f
+17:
+        // 10bit
+        bl X(ff_vvc_apply_bdof_block_16x_10_neon)
+        b 32f
+18:
+        // 12bit
+        bl X(ff_vvc_apply_bdof_block_16x_12_neon)
+32:
+        add sp, sp, #APPLY_BDOF_STACK_SIZE
+        ldp x25, x30, [sp, #0x30]
+        ldp x23, x24, [sp, #0x20]
+        ldp x21, x22, [sp, #0x10]
+        ldp x19, x20, [sp], #0x40
+        ret
+endfunc
+
+#undef APPLY_BDOF_STACK_SIZE
+#undef GRADIENT_H0_OFFSET
+#undef GRADIENT_H1_OFFSET
+#undef GRADIENT_V0_OFFSET
+#undef GRADIENT_V1_OFFSET
+#undef VX_OFFSET
+#undef VY_OFFSET
diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
deleted file mode 100644
index d8ddaacb14..0000000000
--- a/libavcodec/aarch64/vvc/of_template.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2024 Zhao Zhili
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/bit_depth_template.c"
-
-void FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(pixel* dst,
-        ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
-        const int16_t **gh, const int16_t **gv, int16_t *vx, int16_t *vy);
-
-static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
-                             const int16_t *_src0, const int16_t *_src1,
-                             int block_w, int block_h) {
-    // +2 for pad left and right
-    int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
-    int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
-    int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
-    int16_t *gradient_v[2] = {&gradient_buf_v[0][1], &gradient_buf_v[1][1]};
-    ptrdiff_t dst_stride = _dst_stride / sizeof(pixel);
-    pixel *dst = (pixel *) _dst;
-
-    ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0],
-                                    BDOF_BLOCK_SIZE,
-                                    _src0, MAX_PB_SIZE, block_w, block_h);
-    ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
-                                    BDOF_BLOCK_SIZE,
-                                    _src1, MAX_PB_SIZE, block_w, block_h);
-    int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
-    if (block_w == 8)
-        ff_vvc_derive_bdof_vx_vy_8x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
-    else
-        ff_vvc_derive_bdof_vx_vy_16x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
-
-    for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
-        for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
-            const int16_t *src0 = _src0 + y * MAX_PB_SIZE + x;
-            const int16_t *src1 = _src1 + y * MAX_PB_SIZE + x;
-            pixel *d = dst + x;
-            int idx = BDOF_BLOCK_SIZE * y + x;
-            const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
-            const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
-            int idx1 = y + x / BDOF_MIN_BLOCK_SIZE;
-            FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
-                                                             src0, src1, gh, gv,
-                                                             vx + idx1, vy + idx1);
-        }
-        dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
-    }
-}
-- 
2.49.1
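Note: the function-pointer interface is unchanged by this PR; callers still
go through the pointer set in ff_vvc_dsp_init_aarch64() above (usage sketch):

    /* prototype per APPLY_BDOF_FUNC in the diff */
    c->inter.apply_bdof(dst, dst_stride, src0, src1, block_w, block_h);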
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".