From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTPS id 8BFBD4D297 for ; Wed, 19 Feb 2025 16:50:51 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 7F88F68C275; Wed, 19 Feb 2025 18:50:47 +0200 (EET) Received: from out162-62-57-137.mail.qq.com (out162-62-57-137.mail.qq.com [162.62.57.137]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 3A65168BEF2 for ; Wed, 19 Feb 2025 18:50:39 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=foxmail.com; s=s201512; t=1739983828; bh=gOnr5Y5ENLbEIk+mCOfZFK8wktvVUTZgD1Y1zQ+FlX8=; h=From:To:Cc:Subject:Date; b=OooPtjsv7FhemfD+KUNTxR7fWKckmN+g8uqwvW990K1fTuUaurS/9CzHcic0EQh4m bloiLt7YGdB5e4vRyIvHlD8iHHp8lTbeuHqHC6q3JeTWt1Y1fb+aKes82AyGoiEbq6 Rmz+zYWdHBynQMyLz1yo6dBB6HiRV1xv6Mkszfa0= Received: from ZHILIZHAO-MB1.tencent.com ([240e:3b7:3277:36f0:81b:3417:f475:29b9]) by newxmesmtplogicsvrszb16-1.qq.com (NewEsmtp) with SMTP id C9B93E49; Thu, 20 Feb 2025 00:50:27 +0800 X-QQ-mid: xmsmtpt1739983827thpyri5ar Message-ID: X-QQ-XMAILINFO: Msf7FzQQGWpRfAhXLhrwsYE8K4aDq/06O9e05Jvp7HGpFV2bRdOjmLwvW2xesH KH6NFEswIXd+3dG6IPI6lRnaGrK2kVpdRukEIzyX2emSMGzg2LcrtOez3cGHy5Ij4bieoMkEkyv7 qdtrvxlJt3tL1fhqOUyzhm3kHAEjQCiOBlt+goS9iLNm1/GzeJO4MteJSpH6SpF7KhfM98nmDnVH Cqua5EB4OFqExqD6LToiggTcGxx4b37KiFXqqemBTxXOrKanB3xIuDbFg7qz+KhXokVpY06jQeM3 IegiDR0VEaEaxRy4cYYGnHlfIpGmbXRsFJIohNFn+UzGMuDN3DM5bfylnKnD5hIMY4SMO0h0awWD s6AKRrscRGXVReESPqbP7wru+ZYlYFf+rUc1EuOb9ZC0P4Wa7VdWyxPRvOoOTYyb20av3yy6qPRq 21hzcoDYdbo/AyvyvXaLIMFRYB9VJSwkzTMcN6j6xQgyjIkquZuKVVooYdLuVGgV5nGJ4Wzi9rnu lRTNmpGNxv58NG2CtC9ZGHmLzZRXGDwfAnc00mb7ycjhYmQPH6fBLgxpvXHC2y4rQNgPHruwcHS7 ad/VKnjJMH0+pvHEPyKTf4Lpx4Bjwjb9HeCYLdgYmEqB62MoncHVNo8/Xf5S2acQKjFZbSTRBnqW 0LD8TergOXxabh3yxS6OzXz68Gyw6vctKleRPz18tavSwBGrqyVGRIETedFdow3dU1PsHdh4jYN7 
B6pzyJQVdnsl6X9o3SRfRqxAXfjmqsxNxqHzhE3FDgXl/yHzCUmE2P+n3gJVtqKj/zGASl6cW0Hu EN9m8piCvva56HUQEpORwTdma42U6dqmIOgeaPAmFkSSuQK7c+hxd2MVmfeuFuHIIMjbe7HbEpDi t1YJQ/6QK102d+KX9bwQqpuVAwkTNwsQ2KXZf3jie11hb8sj6vjrY1vkgjm2ntUf5hJiwR65DF2i hSYGSRpClGx1vKKDn/t8GhObYejR/xaJavhehXNGvkZfdXNE0lCw== X-QQ-XMRINFO: Mp0Kj//9VHAxr69bL5MkOOs= From: Zhao Zhili To: ffmpeg-devel@ffmpeg.org Date: Thu, 20 Feb 2025 00:50:23 +0800 X-OQ-MSGID: <20250219165023.47171-1-quinkblack@foxmail.com> X-Mailer: git-send-email 2.46.0 MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 1/2] aarch64/hevcdsp_idct_neon: Optimize idct dc X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Zhao Zhili Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: From: Zhao Zhili clang does better than the assembly code before the patch, especially for small sizes: hevc_idct_4x4_dc_8_c: 11.2 ( 1.00x) hevc_idct_4x4_dc_8_neon: 15.5 ( 0.73x) hevc_idct_4x4_dc_10_c: 12.0 ( 1.00x) hevc_idct_4x4_dc_10_neon: 15.2 ( 0.79x) hevc_idct_8x8_dc_8_c: 13.2 ( 1.00x) hevc_idct_8x8_dc_8_neon: 18.2 ( 0.73x) hevc_idct_8x8_dc_10_c: 13.5 ( 1.00x) hevc_idct_8x8_dc_10_neon: 17.2 ( 0.78x) hevc_idct_16x16_dc_8_c: 41.8 ( 1.00x) hevc_idct_16x16_dc_8_neon: 37.8 ( 1.11x) hevc_idct_16x16_dc_10_c: 41.8 ( 1.00x) hevc_idct_16x16_dc_10_neon: 37.8 ( 1.11x) hevc_idct_32x32_dc_8_c: 130.2 ( 1.00x) hevc_idct_32x32_dc_8_neon: 132.2 ( 0.98x) hevc_idct_32x32_dc_10_c: 130.2 ( 1.00x) hevc_idct_32x32_dc_10_neon: 132.2 ( 0.98x) This patch basically clones what the compiler does, so the performance is the same. 
--- libavcodec/aarch64/hevcdsp_idct_neon.S | 59 ++++++++++++++------------ 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 3cac6e6db9..4543ab6b07 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -888,38 +888,45 @@ function ff_hevc_transform_luma_4x4_neon_8, export=1 ret endfunc +.macro idct_8x8_dc_store offset +.irp i, 0x0, 0x20, 0x40, 0x60 + stp q0, q0, [x0, #(\offset + \i)] +.endr +.endm + +.macro idct_16x16_dc_store +.irp index, 0x0, 0x80, 0x100, 0x180 + idct_8x8_dc_store offset=\index +.endr +.endm + // void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs) .macro idct_dc size, bitdepth function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1 - ld1r {v4.8h}, [x0] - srshr v4.8h, v4.8h, #1 - srshr v0.8h, v4.8h, #(14 - \bitdepth) - srshr v1.8h, v4.8h, #(14 - \bitdepth) -.if \size > 4 - srshr v2.8h, v4.8h, #(14 - \bitdepth) - srshr v3.8h, v4.8h, #(14 - \bitdepth) -.if \size > 16 /* dc 32x32 */ - mov x2, #4 + ldrsh w1, [x0] + add w1, w1, #1 + asr w1, w1, #1 + add w1, w1, #(1 << (13 - \bitdepth)) + asr w1, w1, #(14 - \bitdepth) + dup v0.8h, w1 + +.if \size < 8 + stp q0, q0, [x0] +.else +.if \size < 16 + idct_8x8_dc_store 0x0 +.else +.if \size < 32 + idct_16x16_dc_store +.else + add x2, x0, #(32 * 32 * 2) 1: - subs x2, x2, #1 + idct_16x16_dc_store + add x0, x0, #(16 * 16 * 2) + cmp x0, x2 + b.lt 1b .endif - add x12, x0, #64 - mov x13, #128 -.if \size > 8 /* dc 16x16 */ - st1 {v0.8h-v3.8h}, [x0], x13 - st1 {v0.8h-v3.8h}, [x12], x13 - st1 {v0.8h-v3.8h}, [x0], x13 - st1 {v0.8h-v3.8h}, [x12], x13 - st1 {v0.8h-v3.8h}, [x0], x13 - st1 {v0.8h-v3.8h}, [x12], x13 -.endif /* dc 8x8 */ - st1 {v0.8h-v3.8h}, [x0], x13 - st1 {v0.8h-v3.8h}, [x12], x13 -.if \size > 16 /* dc 32x32 */ - bne 1b .endif -.else /* dc 4x4 */ - st1 {v0.8h-v1.8h}, [x0] .endif ret endfunc -- 2.46.0 _______________________________________________ 
ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".