From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTPS id CCC3E4BBFC for ; Mon, 3 Mar 2025 21:19:01 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 6F14568EB44; Mon, 3 Mar 2025 23:18:58 +0200 (EET) Received: from szaka.eu (szaka.eu [144.217.86.229]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 0205668DEA2 for ; Mon, 3 Mar 2025 23:18:51 +0200 (EET) To: ffmpeg-devel@ffmpeg.org Date: Mon, 3 Mar 2025 22:18:23 +0100 Message-ID: <20250303211822.13925-2-ffmpeg@szaka.eu> X-Mailer: git-send-email 2.47.2 In-Reply-To: References: MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH v2] avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12} X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , From: Krzysztof Pyrkosz via ffmpeg-devel Reply-To: FFmpeg development discussions and patches Cc: Krzysztof Pyrkosz Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: This patch replaces integer widening with halving addition, and multi-step "emulated" rounding shift with a single asm instruction doing exactly that. 
Benchmarks before and after (for each core, the first group is before this patch and the second group is after): A78 before: avg_8_64x64_neon: 2686.2 ( 6.12x) avg_8_128x128_neon: 10734.2 ( 5.88x) avg_10_64x64_neon: 2536.8 ( 5.40x) avg_10_128x128_neon: 10079.0 ( 5.22x) avg_12_64x64_neon: 2548.2 ( 5.38x) avg_12_128x128_neon: 10133.8 ( 5.19x) after: avg_8_64x64_neon: 897.8 (18.26x) avg_8_128x128_neon: 3608.5 (17.37x) avg_10_32x32_neon: 444.2 ( 8.51x) avg_10_64x64_neon: 1711.8 ( 8.00x) avg_12_64x64_neon: 1706.2 ( 8.02x) avg_12_128x128_neon: 7010.0 ( 7.46x) A72 before: avg_8_64x64_neon: 5823.4 ( 3.88x) avg_8_128x128_neon: 17430.5 ( 4.73x) avg_10_64x64_neon: 5228.1 ( 3.71x) avg_10_128x128_neon: 16722.2 ( 4.17x) avg_12_64x64_neon: 5379.1 ( 3.51x) avg_12_128x128_neon: 16715.7 ( 4.17x) after: avg_8_64x64_neon: 2006.5 (10.61x) avg_8_128x128_neon: 9158.7 ( 8.96x) avg_10_64x64_neon: 3357.7 ( 5.60x) avg_10_128x128_neon: 12411.7 ( 5.56x) avg_12_64x64_neon: 3317.5 ( 5.67x) avg_12_128x128_neon: 12358.5 ( 5.58x) A53 before: avg_8_64x64_neon: 8327.8 ( 5.18x) avg_8_128x128_neon: 31631.3 ( 5.34x) avg_10_64x64_neon: 8783.5 ( 4.98x) avg_10_128x128_neon: 32617.0 ( 5.25x) avg_12_64x64_neon: 8686.0 ( 5.06x) avg_12_128x128_neon: 32487.5 ( 5.25x) after: avg_8_64x64_neon: 6032.3 ( 7.17x) avg_8_128x128_neon: 22008.5 ( 7.69x) avg_10_64x64_neon: 7738.0 ( 5.68x) avg_10_128x128_neon: 27813.8 ( 6.14x) avg_12_64x64_neon: 7844.5 ( 5.60x) avg_12_128x128_neon: 26999.5 ( 6.34x) --- libavcodec/aarch64/vvc/inter.S | 177 ++++++++++++++++++++++++--------- 1 file changed, 130 insertions(+), 47 deletions(-) diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S index 0edc861f97..b2f44697d3 100644 --- a/libavcodec/aarch64/vvc/inter.S +++ b/libavcodec/aarch64/vvc/inter.S @@ -24,9 +24,9 @@ #define BDOF_BLOCK_SIZE 16 #define BDOF_MIN_BLOCK_SIZE 4 -.macro vvc_avg type, bit_depth +.macro vvc_avg bit_depth -.macro vvc_\type\()_\bit_depth\()_2_4 tap +.macro vvc_w_avg_\bit_depth\()_2_4 tap .if \tap == 2 ldr s0, [src0] ldr s2, [src1] @@ -34,18 +34,11 @@ ldr d0, [src0] ldr d2, [src1] .endif - -.ifc \type, avg - saddl v4.4s, 
v0.4h, v2.4h - add v4.4s, v4.4s, v16.4s - sqshrun v4.4h, v4.4s, #(15 - \bit_depth) -.else mov v4.16b, v16.16b smlal v4.4s, v0.4h, v19.4h smlal v4.4s, v2.4h, v20.4h sqshl v4.4s, v4.4s, v22.4s sqxtun v4.4h, v4.4s -.endif .if \bit_depth == 8 sqxtun v4.8b, v4.8h @@ -68,7 +61,7 @@ add dst, dst, dst_stride .endm -function ff_vvc_\type\()_\bit_depth\()_neon, export=1 +function ff_vvc_w_avg_\bit_depth\()_neon, export=1 dst .req x0 dst_stride .req x1 src0 .req x2 @@ -78,9 +71,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1 mov x10, #(VVC_MAX_PB_SIZE * 2) cmp width, #8 -.ifc \type, avg - movi v16.4s, #(1 << (14 - \bit_depth)) -.else lsr x11, x6, #32 // weight0 mov w12, w6 // weight1 lsr x13, x7, #32 // offset @@ -91,9 +81,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1 dup v20.8h, w12 dup v16.4s, w13 dup v22.4s, w14 -.endif // avg - .if \bit_depth >= 10 +.if \bit_depth >= 10 // clip pixel mov w6, #((1 << \bit_depth) - 1) dup v17.8h, w6 @@ -105,25 +94,17 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1 b.eq 4f 2: // width == 2 subs height, height, #1 - vvc_\type\()_\bit_depth\()_2_4 2 + vvc_w_avg_\bit_depth\()_2_4 2 b.ne 2b b 32f 4: // width == 4 subs height, height, #1 - vvc_\type\()_\bit_depth\()_2_4 4 + vvc_w_avg_\bit_depth\()_2_4 4 b.ne 4b b 32f 8: // width == 8 ld1 {v0.8h}, [src0], x10 ld1 {v2.8h}, [src1], x10 -.ifc \type, avg - saddl v4.4s, v0.4h, v2.4h - saddl2 v5.4s, v0.8h, v2.8h - add v4.4s, v4.4s, v16.4s - add v5.4s, v5.4s, v16.4s - sqshrun v4.4h, v4.4s, #(15 - \bit_depth) - sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth) -.else mov v4.16b, v16.16b mov v5.16b, v16.16b smlal v4.4s, v0.4h, v19.4h @@ -134,7 +115,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1 sqshl v5.4s, v5.4s, v22.4s sqxtun v4.4h, v4.4s sqxtun2 v4.8h, v5.4s -.endif subs height, height, #1 .if \bit_depth == 8 sqxtun v4.8b, v4.8h @@ -153,20 +133,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1 17: ldp q0, q1, [x7], #32 ldp q2, q3, [x8], #32 -.ifc \type, avg 
- saddl v4.4s, v0.4h, v2.4h - saddl2 v5.4s, v0.8h, v2.8h - saddl v6.4s, v1.4h, v3.4h - saddl2 v7.4s, v1.8h, v3.8h - add v4.4s, v4.4s, v16.4s - add v5.4s, v5.4s, v16.4s - add v6.4s, v6.4s, v16.4s - add v7.4s, v7.4s, v16.4s - sqshrun v4.4h, v4.4s, #(15 - \bit_depth) - sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth) - sqshrun v6.4h, v6.4s, #(15 - \bit_depth) - sqshrun2 v6.8h, v7.4s, #(15 - \bit_depth) -.else // avg mov v4.16b, v16.16b mov v5.16b, v16.16b mov v6.16b, v16.16b @@ -187,7 +153,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1 sqxtun v6.4h, v6.4s sqxtun2 v4.8h, v5.4s sqxtun2 v6.8h, v7.4s -.endif // w_avg subs w6, w6, #16 .if \bit_depth == 8 sqxtun v4.8b, v4.8h @@ -217,12 +182,130 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1 endfunc .endm -vvc_avg avg, 8 -vvc_avg avg, 10 -vvc_avg avg, 12 -vvc_avg w_avg, 8 -vvc_avg w_avg, 10 -vvc_avg w_avg, 12 +vvc_avg 8 +vvc_avg 10 +vvc_avg 12 + +.macro vvc_avg2 bit_depth +function ff_vvc_avg_\bit_depth\()_neon, export=1 + mov x10, #(VVC_MAX_PB_SIZE * 2) + movi v16.8h, #0 + movi v17.16b, #255 + ushr v17.8h, v17.8h, #(16 - \bit_depth) + + cmp w4, #8 + b.gt 16f + b.eq 8f + cmp w4, #4 + b.eq 4f + +2: // width == 2 + ldr s0, [x2] + subs w5, w5, #1 + ldr s1, [x3] +.if \bit_depth == 8 + shadd v0.4h, v0.4h, v1.4h + sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth) + str h0, [x0] +.else + shadd v0.4h, v0.4h, v1.4h + srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth) + smax v0.4h, v0.4h, v16.4h + smin v0.4h, v0.4h, v17.4h + str s0, [x0] +.endif + add x2, x2, #(VVC_MAX_PB_SIZE * 2) + add x3, x3, #(VVC_MAX_PB_SIZE * 2) + add x0, x0, x1 + b.ne 2b + ret + +4: // width == 4 + ldr d0, [x2] + subs w5, w5, #1 + ldr d1, [x3] +.if \bit_depth == 8 + shadd v0.4h, v0.4h, v1.4h + sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth) + str s0, [x0] +.else + shadd v0.4h, v0.4h, v1.4h + srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth) + smax v0.4h, v0.4h, v16.4h + smin v0.4h, v0.4h, v17.4h + str d0, [x0] +.endif + add x2, x2, #(VVC_MAX_PB_SIZE * 2) + add x3, 
x3, #(VVC_MAX_PB_SIZE * 2) + add x0, x0, x1 + b.ne 4b + ret + +8: // width == 8 + ldr q0, [x2] + subs w5, w5, #1 + ldr q1, [x3] +.if \bit_depth == 8 + shadd v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth) + str d0, [x0] +.else + shadd v0.8h, v0.8h, v1.8h + srshr v0.8h, v0.8h, #(15 - 1 - \bit_depth) + smax v0.8h, v0.8h, v16.8h + smin v0.8h, v0.8h, v17.8h + str q0, [x0] +.endif + add x2, x2, #(VVC_MAX_PB_SIZE * 2) + add x3, x3, #(VVC_MAX_PB_SIZE * 2) + add x0, x0, x1 + b.ne 8b + ret + +16: // width >= 16 +.if \bit_depth == 8 + sub x1, x1, w4, sxtw +.else + sub x1, x1, w4, sxtw #1 +.endif + sub x10, x10, w4, sxtw #1 +3: + mov w6, w4 // width +1: + ldp q0, q1, [x2], #32 + subs w6, w6, #16 + ldp q2, q3, [x3], #32 +.if \bit_depth == 8 + shadd v4.8h, v0.8h, v2.8h + shadd v5.8h, v1.8h, v3.8h + sqrshrun v0.8b, v4.8h, #6 + sqrshrun2 v0.16b, v5.8h, #6 + st1 {v0.16b}, [x0], #16 +.else + shadd v4.8h, v0.8h, v2.8h + shadd v5.8h, v1.8h, v3.8h + srshr v0.8h, v4.8h, #(15 - 1 - \bit_depth) + srshr v1.8h, v5.8h, #(15 - 1 - \bit_depth) + smax v0.8h, v0.8h, v16.8h + smax v1.8h, v1.8h, v16.8h + smin v0.8h, v0.8h, v17.8h + smin v1.8h, v1.8h, v17.8h + stp q0, q1, [x0], #32 +.endif + b.ne 1b + + subs w5, w5, #1 + add x2, x2, x10 + add x3, x3, x10 + add x0, x0, x1 + b.ne 3b + ret +endfunc +.endm + +vvc_avg2 8 +vvc_avg2 10 +vvc_avg2 12 /* x0: int16_t *dst * x1: const uint8_t *_src -- 2.47.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".