From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTP id 20B6D453BD for ; Wed, 29 Mar 2023 20:29:20 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id C1AF768C40C; Wed, 29 Mar 2023 23:29:16 +0300 (EEST) Received: from mail8.parnet.fi (mail8.parnet.fi [77.234.108.134]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 7FCF168BFA0 for ; Wed, 29 Mar 2023 23:29:10 +0300 (EEST) Received: from mail9.parnet.fi (mail9.parnet.fi [77.234.108.21]) by mail8.parnet.fi with ESMTP id 32TKT987030998-32TKT988030998 for ; Wed, 29 Mar 2023 23:29:09 +0300 Received: from foo.martin.st (host-97-187.parnet.fi [77.234.97.187]) by mail9.parnet.fi (Postfix) with ESMTPS id 86C7FA1469 for ; Wed, 29 Mar 2023 23:29:09 +0300 (EEST) Date: Wed, 29 Mar 2023 23:29:09 +0300 (EEST) From: =?ISO-8859-15?Q?Martin_Storsj=F6?= To: FFmpeg development discussions and patches In-Reply-To: <20230329141346.3718-2-jdek@itanimul.li> Message-ID: References: <20230329141346.3718-1-jdek@itanimul.li> <20230329141346.3718-2-jdek@itanimul.li> MIME-Version: 1.0 X-FE-Policy-ID: 3:14:2:SYSTEM Subject: Re: [FFmpeg-devel] [PATCH v4 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="us-ascii"; Format="flowed" Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: On Wed, 29 Mar 2023, J. 
Dekker wrote: > Benched on Ampere Altra: > > hevc_h_loop_filter_chroma8_c: 367.7 > hevc_h_loop_filter_chroma8_neon: 31.0 > hevc_h_loop_filter_chroma10_c: 396.7 > hevc_h_loop_filter_chroma10_neon: 27.5 > hevc_h_loop_filter_chroma12_c: 377.0 > hevc_h_loop_filter_chroma12_neon: 31.7 > hevc_v_loop_filter_chroma8_c: 369.0 > hevc_v_loop_filter_chroma8_neon: 55.0 > hevc_v_loop_filter_chroma10_c: 389.0 > hevc_v_loop_filter_chroma10_neon: 54.0 > hevc_v_loop_filter_chroma12_c: 389.5 > hevc_v_loop_filter_chroma12_neon: 53.0 > > Signed-off-by: J. Dekker > --- > > Included Martin's comments, decent speedup on vertical filter (~50%). > > libavcodec/aarch64/Makefile | 3 +- > libavcodec/aarch64/hevcdsp_deblock_neon.S | 180 ++++++++++++++++++++++ > libavcodec/aarch64/hevcdsp_init_aarch64.c | 18 +++ > 3 files changed, 200 insertions(+), 1 deletion(-) > create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S > > diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile > index 02fb51c3ab..216191640c 100644 > --- a/libavcodec/aarch64/Makefile > +++ b/libavcodec/aarch64/Makefile > @@ -65,7 +65,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ > aarch64/vp9lpf_neon.o \ > aarch64/vp9mc_16bpp_neon.o \ > aarch64/vp9mc_neon.o > -NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \ > +NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_deblock_neon.o \ > + aarch64/hevcdsp_idct_neon.o \ > aarch64/hevcdsp_init_aarch64.o \ > aarch64/hevcdsp_qpel_neon.o \ > aarch64/hevcdsp_sao_neon.o > diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S > new file mode 100644 > index 0000000000..ed342e5ded > --- /dev/null > +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S > @@ -0,0 +1,180 @@ > +/* -*-arm64-*- > + * vim: syntax=arm64asm > + * > + * Copyright (c) 2014 Seppo Tomperi > + * Copyright (c) 2023 J. Dekker > + * > + * This file is part of FFmpeg. 
> + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > + > +#include "libavutil/aarch64/asm.S" > +#include "neon.S" > + > +.macro hevc_loop_filter_chroma_start bitdepth > + mov x4, x30 > + ldr w14, [x2] > + ldr w15, [x2, #4] > +.if \bitdepth > 8 > + lsl w14, w14, #(\bitdepth - 8) > + lsl w15, w15, #(\bitdepth - 8) > +.endif > + adds w2, w14, w15 > + b.eq 1f > + dup v16.4h, w14 > + dup v17.4h, w15 > + trn1 v16.2d, v16.2d, v17.2d > +.if \bitdepth > 8 > + mvni v19.8h, #((0xff << (\bitdepth - 8)) & 0xff), lsl #8 > + movi v18.8h, #0 > +.endif > + neg v17.8h, v16.8h > +.endm > + > +.macro hevc_loop_filter_chroma_body bitdepth > +.if \bitdepth <= 8 > + uxtl v20.8h, v0.8b // p1 > + uxtl v1.8h, v1.8b // p0 > + uxtl v2.8h, v2.8b // q0 > + uxtl v23.8h, v3.8b // q1 > + va .req v20 > + vb .req v23 > +.else // required to specify both cases as we are unable to do: v0 .req v20 > + va .req v0 > + vb .req v3 > +.endif > + sub v5.8h, v2.8h, v1.8h // q0 - p0 > + sub v6.8h, va.8h, vb.8h // p1 - q1 > + shl v5.8h, v5.8h, #2 > + add v5.8h, v6.8h, v5.8h > + srshr v5.8h, v5.8h, #3 > + clip v17.8h, v16.8h, v5.8h > + sqadd v1.8h, v1.8h, v5.8h // p0 + delta > + sqsub v2.8h, v2.8h, v5.8h // q0 - delta > +.if \bitdepth <= 8 > + sqxtun v1.8b, v1.8h > + sqxtun v2.8b, v2.8h > +.unreq va > 
+.unreq vb Shouldn't the .unreq be outside of the .if/.else? > +.else > + clip v18.8h, v19.8h, v1.8h, v2.8h > +.endif > +.endm > + > +function hevc_loop_filter_chroma_body_8_neon, export=0 > + hevc_loop_filter_chroma_body 8 > + ret > +endfunc > + > +function hevc_loop_filter_chroma_body_10_neon, export=0 > +hevc_loop_filter_chroma_body_12_neon: > + hevc_loop_filter_chroma_body 10 > + ret > +endfunc > + > +// void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); > + > +.macro hevc_h_loop_filter_chroma bitdepth > +function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1 > + hevc_loop_filter_chroma_start \bitdepth > + sub x0, x0, x1, lsl #1 > +.if \bitdepth > 8 > + ld1 {v0.8h}, [x0], x1 > + ld1 {v1.8h}, [x0], x1 > + ld1 {v2.8h}, [x0], x1 > + ld1 {v3.8h}, [x0] > +.else > + ld1 {v0.8b}, [x0], x1 > + ld1 {v1.8b}, [x0], x1 > + ld1 {v2.8b}, [x0], x1 > + ld1 {v3.8b}, [x0] > +.endif > + sub x0, x0, x1, lsl #1 > + bl hevc_loop_filter_chroma_body_\bitdepth\()_neon > +.if \bitdepth > 8 > + st1 {v1.8h}, [x0], x1 > + st1 {v2.8h}, [x0] > +.else > + st1 {v1.8b}, [x0], x1 > + st1 {v2.8b}, [x0] > +.endif > +1: ret x4 > +endfunc > +.endm > + > +.macro hevc_v_loop_filter_chroma bitdepth > +function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1 > + hevc_loop_filter_chroma_start \bitdepth > + sub x0, x0, #((0x5200 >> \bitdepth) & 0x6) // high -> 4, low -> 2 TBH, I think this is rather obfuscated - I'd prefer to just move the sub (and the two instructions in between) back inside of the .if/.else, to have the sub instruction say more explicitly exactly what it does. Other than that, this patch LGTM now. // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".