From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTP id 2C5FE47073 for ; Sat, 26 Aug 2023 08:49:58 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 3337B68C59D; Sat, 26 Aug 2023 11:49:53 +0300 (EEST) Received: from smtp-my3-01p7.yunyou.top (smtp-my3-01p7.yunyou.top [60.247.169.7]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 2DFE568AAB3 for ; Sat, 26 Aug 2023 11:49:46 +0300 (EEST) Received: from [192.168.15.105] (unknown [183.158.247.103]) by smtp-my-01.yunyou.top (WestCloudMail) with ESMTPA id BF335FE635; Sat, 26 Aug 2023 16:49:38 +0800 (CST) Message-ID: <33af9c88-c31d-e11e-58a3-7f9a05718c8f@myais.com.cn> Date: Sat, 26 Aug 2023 16:49:38 +0800 MIME-Version: 1.0 User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Thunderbird/102.14.0 From: "Logan.Lyu" To: ffmpeg-devel@ffmpeg.org Organization: myais Subject: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_v X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: jb@videolan.org, jdek@itanimul.li Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="us-ascii"; Format="flowed" Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: checkasm bench: put_hevc_epel_uni_hv64_8_i8mm: 6568.7 put_hevc_epel_uni_v4_8_c: 88.7 put_hevc_epel_uni_v4_8_neon: 32.7 put_hevc_epel_uni_v6_8_c: 185.4 put_hevc_epel_uni_v6_8_neon: 44.9 put_hevc_epel_uni_v8_8_c: 333.9 put_hevc_epel_uni_v8_8_neon: 44.4 put_hevc_epel_uni_v12_8_c: 728.7 put_hevc_epel_uni_v12_8_neon: 119.7 put_hevc_epel_uni_v16_8_c: 1224.2 put_hevc_epel_uni_v16_8_neon: 
139.7 put_hevc_epel_uni_v24_8_c: 2531.2 put_hevc_epel_uni_v24_8_neon: 329.9 put_hevc_epel_uni_v32_8_c: 4739.9 put_hevc_epel_uni_v32_8_neon: 562.7 put_hevc_epel_uni_v48_8_c: 10618.7 put_hevc_epel_uni_v48_8_neon: 1256.2 put_hevc_epel_uni_v64_8_c: 19169.9 put_hevc_epel_uni_v64_8_neon: 2179.2 Co-Authored-By: J. Dekker Signed-off-by: Logan Lyu --- libavcodec/aarch64/hevcdsp_epel_neon.S | 320 ++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 + 2 files changed, 325 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S index a8d694639b..7ce7eec829 100644 --- a/libavcodec/aarch64/hevcdsp_epel_neon.S +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S @@ -32,6 +32,326 @@ const epel_filters, align=4 .byte -2, 10, 58, -2 endconst +.macro load_epel_filterb freg, xreg + movrel \xreg, epel_filters + add \xreg, \xreg, \freg, lsl #2 + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg] // filter + neg v0.16b, v0.16b + neg v3.16b, v3.16b +.endm + +.macro calc_epelb dst, src0, src1, src2, src3 + umlsl \dst\().8h, \src0\().8b, v0.8b + umlal \dst\().8h, \src1\().8b, v1.8b + umlal \dst\().8h, \src2\().8b, v2.8b + umlsl \dst\().8h, \src3\().8b, v3.8b +.endm + +.macro calc_epelb2 dst, src0, src1, src2, src3 + umlsl2 \dst\().8h, \src0\().16b, v0.16b + umlal2 \dst\().8h, \src1\().16b, v1.16b + umlal2 \dst\().8h, \src2\().16b, v2.16b + umlsl2 \dst\().8h, \src3\().16b, v3.16b +.endm + +.macro calc_all4 + calc v16, v17, v18, v19 + b.eq 2f + calc v17, v18, v19, v16 + b.eq 2f + calc v18, v19, v16, v17 + b.eq 2f + calc v19, v16, v17, v18 + b.ne 1b +.endm + +.macro calc_all8 + calc v16, v17, v18, v19, v20, v21, v22, v23 + b.eq 2f + calc v18, v19, v20, v21, v22, v23, v16, v17 + b.eq 2f + calc v20, v21, v22, v23, v16, v17, v18, v19 + b.eq 2f + calc v22, v23, v16, v17, v18, v19, v20, v21 + b.ne 1b +.endm + +.macro calc_all12 + calc v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27 + b.eq 2f + calc v19, v20, v21, v22, v23, v24, v25, 
v26, v27, v16, v17, v18 + b.eq 2f + calc v22, v23, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 + b.eq 2f + calc v25, v26, v27, v16, v17, v18, v19, v20, v21, v22, v23, v24 + b.ne 1b +.endm + +.macro calc_all16 + calc v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + b.eq 2f + calc v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v16, v17, v18, v19 + b.eq 2f + calc v24, v25, v26, v27, v28, v29, v30, v31, v16, v17, v18, v19, v20, v21, v22, v23 + b.eq 2f + calc v28, v29, v30, v31, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27 + b.ne 1b +.endm + +function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1 + load_epel_filterb x6, x5 + sxtw x3, w3 + sxtw x1, w1 + sub x2, x2, x3 + ld1 {v16.s}[0], [x2], x3 + ld1 {v17.s}[0], [x2], x3 + ld1 {v18.s}[0], [x2], x3 +.macro calc src0, src1, src2, src3 + ld1 {\src3\().s}[0], [x2], x3 + movi v4.8h, #0 + calc_epelb v4, \src0, \src1, \src2, \src3 + sqrshrun v4.8b, v4.8h, #6 + subs w4, w4, #1 + st1 {v4.s}[0], [x0], x1 +.endm +1: calc_all4 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_uni_v6_8_neon, export=1 + load_epel_filterb x6, x5 + sxtw x3, w3 + sxtw x1, w1 + sub x2, x2, x3 + sub x1, x1, #4 + ld1 {v16.8b}, [x2], x3 + ld1 {v17.8b}, [x2], x3 + ld1 {v18.8b}, [x2], x3 +.macro calc src0, src1, src2, src3 + ld1 {\src3\().8b}, [x2], x3 + movi v4.8h, #0 + calc_epelb v4, \src0, \src1, \src2, \src3 + sqrshrun v4.8b, v4.8h, #6 + st1 {v4.s}[0], [x0], #4 + subs w4, w4, #1 + st1 {v4.h}[2], [x0], x1 +.endm +1: calc_all4 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_uni_v8_8_neon, export=1 + load_epel_filterb x6, x5 + sxtw x3, w3 + sxtw x1, w1 + sub x2, x2, x3 + ld1 {v16.8b}, [x2], x3 + ld1 {v17.8b}, [x2], x3 + ld1 {v18.8b}, [x2], x3 +.macro calc src0, src1, src2, src3 + ld1 {\src3\().8b}, [x2], x3 + movi v4.8h, #0 + calc_epelb v4, \src0, \src1, \src2, \src3 + sqrshrun v4.8b, v4.8h, #6 + subs w4, w4, #1 + st1 {v4.8b}, [x0], x1 +.endm +1: calc_all4 +.purgem 
calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_uni_v12_8_neon, export=1 + load_epel_filterb x6, x5 + sxtw x3, w3 + sxtw x1, w1 + sub x2, x2, x3 + sub x1, x1, #8 + ld1 {v16.16b}, [x2], x3 + ld1 {v17.16b}, [x2], x3 + ld1 {v18.16b}, [x2], x3 +.macro calc src0, src1, src2, src3 + ld1 {\src3\().16b}, [x2], x3 + movi v4.8h, #0 + movi v5.8h, #0 + calc_epelb v4, \src0, \src1, \src2, \src3 + calc_epelb2 v5, \src0, \src1, \src2, \src3 + sqrshrun v4.8b, v4.8h, #6 + sqrshrun2 v4.16b, v5.8h, #6 + subs w4, w4, #1 + st1 {v4.8b}, [x0], #8 + st1 {v4.s}[2], [x0], x1 +.endm +1: calc_all4 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_uni_v16_8_neon, export=1 + load_epel_filterb x6, x5 + sxtw x3, w3 + sxtw x1, w1 + sub x2, x2, x3 + ld1 {v16.16b}, [x2], x3 + ld1 {v17.16b}, [x2], x3 + ld1 {v18.16b}, [x2], x3 +.macro calc src0, src1, src2, src3 + ld1 {\src3\().16b}, [x2], x3 + movi v4.8h, #0 + movi v5.8h, #0 + calc_epelb v4, \src0, \src1, \src2, \src3 + calc_epelb2 v5, \src0, \src1, \src2, \src3 + sqrshrun v4.8b, v4.8h, #6 + sqrshrun2 v4.16b, v5.8h, #6 + subs w4, w4, #1 + st1 {v4.16b}, [x0], x1 +.endm +1: calc_all4 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_uni_v24_8_neon, export=1 + load_epel_filterb x6, x5 + sxtw x3, w3 + sxtw x1, w1 + sub x2, x2, x3 + ld1 {v16.8b, v17.8b, v18.8b}, [x2], x3 + ld1 {v19.8b, v20.8b, v21.8b}, [x2], x3 + ld1 {v22.8b, v23.8b, v24.8b}, [x2], x3 +.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11 + ld1 {\src9\().8b, \src10\().8b, \src11\().8b}, [x2], x3 + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + calc_epelb v4, \src0, \src3, \src6, \src9 + calc_epelb v5, \src1, \src4, \src7, \src10 + calc_epelb v6, \src2, \src5, \src8, \src11 + sqrshrun v4.8b, v4.8h, #6 + sqrshrun v5.8b, v5.8h, #6 + sqrshrun v6.8b, v6.8h, #6 + subs w4, w4, #1 + st1 {v4.8b-v6.8b}, [x0], x1 +.endm +1: calc_all12 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_uni_v32_8_neon, export=1 + 
load_epel_filterb x6, x5 + sxtw x3, w3 + sxtw x1, w1 + sub x2, x2, x3 + ld1 {v16.16b, v17.16b}, [x2], x3 + ld1 {v18.16b, v19.16b}, [x2], x3 + ld1 {v20.16b, v21.16b}, [x2], x3 +.macro calc src0, src1, src2, src3, src4, src5, src6, src7 + ld1 {\src6\().16b, \src7\().16b}, [x2], x3 + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 + calc_epelb v4, \src0, \src2, \src4, \src6 + calc_epelb2 v5, \src0, \src2, \src4, \src6 + calc_epelb v6, \src1, \src3, \src5, \src7 + calc_epelb2 v7, \src1, \src3, \src5, \src7 + sqrshrun v4.8b, v4.8h, #6 + sqrshrun2 v4.16b, v5.8h, #6 + sqrshrun v5.8b, v6.8h, #6 + sqrshrun2 v5.16b, v7.8h, #6 + subs w4, w4, #1 + st1 {v4.16b, v5.16b}, [x0], x1 +.endm +1: calc_all8 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_uni_v48_8_neon, export=1 + load_epel_filterb x6, x5 + sxtw x3, w3 + sxtw x1, w1 + sub x2, x2, x3 + ld1 {v16.16b, v17.16b, v18.16b}, [x2], x3 + ld1 {v19.16b, v20.16b, v21.16b}, [x2], x3 + ld1 {v22.16b, v23.16b, v24.16b}, [x2], x3 +.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11 + ld1 {\src9\().16b, \src10\().16b, \src11\().16b}, [x2], x3 + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 + movi v28.8h, #0 + movi v29.8h, #0 + calc_epelb v4, \src0, \src3, \src6, \src9 + calc_epelb2 v5, \src0, \src3, \src6, \src9 + calc_epelb v6, \src1, \src4, \src7, \src10 + calc_epelb2 v7, \src1, \src4, \src7, \src10 + calc_epelb v28, \src2, \src5, \src8, \src11 + calc_epelb2 v29, \src2, \src5, \src8, \src11 + sqrshrun v4.8b, v4.8h, #6 + sqrshrun2 v4.16b, v5.8h, #6 + sqrshrun v5.8b, v6.8h, #6 + sqrshrun2 v5.16b, v7.8h, #6 + sqrshrun v6.8b, v28.8h, #6 + sqrshrun2 v6.16b, v29.8h, #6 + subs w4, w4, #1 + st1 {v4.16b, v5.16b, v6.16b}, [x0], x1 +.endm +1: calc_all12 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_uni_v64_8_neon, export=1 + load_epel_filterb x6, x5 + sub sp, sp, #32 + sxtw x3, w3 + sxtw x1, w1 + st1 {v8.8b-v11.8b}, [sp] + sub x2, x2, x3 + 
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3 +.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15 + ld1 {\src12\().16b, \src13\().16b, \src14\().16b, \src15\().16b}, [x2], x3 + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 + movi v8.8h, #0 + movi v9.8h, #0 + movi v10.8h, #0 + movi v11.8h, #0 + calc_epelb v10, \src3, \src7, \src11, \src15 + calc_epelb2 v11, \src3, \src7, \src11, \src15 + calc_epelb v4, \src0, \src4, \src8, \src12 + calc_epelb2 v5, \src0, \src4, \src8, \src12 + calc_epelb v6, \src1, \src5, \src9, \src13 + calc_epelb2 v7, \src1, \src5, \src9, \src13 + calc_epelb v8, \src2, \src6, \src10, \src14 + calc_epelb2 v9, \src2, \src6, \src10, \src14 + sqrshrun v4.8b, v4.8h, #6 + sqrshrun2 v4.16b, v5.8h, #6 + sqrshrun v5.8b, v6.8h, #6 + sqrshrun2 v5.16b, v7.8h, #6 + sqrshrun v6.8b, v8.8h, #6 + sqrshrun2 v6.16b, v9.8h, #6 + sqrshrun v7.8b, v10.8h, #6 + sqrshrun2 v7.16b, v11.8h, #6 + subs w4, w4, #1 + st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1 +.endm +1: calc_all16 +.purgem calc +2: ld1 {v8.8b-v11.8b}, [sp] + add sp, sp, #32 + ret +endfunc + #if HAVE_I8MM .macro EPEL_H_HEADER diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index e125b0cfb2..f1e167c50b 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -161,6 +161,10 @@ NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width),); +NEON8_FNPROTO(epel_uni_v, (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, intptr_t mx, intptr_t my, int width),); + NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int 
ox, @@ -285,6 +289,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon; NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,); + NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,); NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,); NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,); NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,); -- 2.38.0.windows.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".