From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTP id 6A34745734 for ; Wed, 22 Mar 2023 09:27:07 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 41D9368C195; Wed, 22 Mar 2023 11:27:05 +0200 (EET) Received: from mail8.parnet.fi (mail8.parnet.fi [77.234.108.134]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 2987A68BFFF for ; Wed, 22 Mar 2023 11:26:59 +0200 (EET) Received: from mail9.parnet.fi (mail9.parnet.fi [77.234.108.21]) by mail8.parnet.fi with ESMTP id 32M9QwTi012992-32M9QwTj012992 for ; Wed, 22 Mar 2023 11:26:58 +0200 Received: from foo.martin.st (host-97-187.parnet.fi [77.234.97.187]) by mail9.parnet.fi (Postfix) with ESMTPS id 63D0BA1407 for ; Wed, 22 Mar 2023 11:26:58 +0200 (EET) Date: Wed, 22 Mar 2023 11:26:58 +0200 (EET) From: =?ISO-8859-15?Q?Martin_Storsj=F6?= To: FFmpeg development discussions and patches In-Reply-To: <20230322000710.47513-3-jdek@itanimul.li> Message-ID: <7aff2ff-4b80-5575-f8a6-fbf03c17646f@martin.st> References: <61cbba0-956c-86ff-340-26a23453e0d@martin.st> <20230322000710.47513-1-jdek@itanimul.li> <20230322000710.47513-3-jdek@itanimul.li> MIME-Version: 1.0 X-FE-Policy-ID: 3:14:2:SYSTEM Subject: Re: [FFmpeg-devel] [PATCH v3 3/3] lavc/aarch64: add hevc deblock chroma 8-12bit X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="us-ascii"; Format="flowed" Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: On Wed, 22 Mar 2023, J. Dekker wrote: > Signed-off-by: J. 
Dekker > --- > > - Using clip macro > - Avoided need for .elseif at all, just used better mvni Can you provide some benchmark numbers for it in the commit message, to get the ballpark figures? > libavcodec/aarch64/Makefile | 3 +- > libavcodec/aarch64/hevcdsp_deblock_neon.S | 180 ++++++++++++++++++++++ > libavcodec/aarch64/hevcdsp_init_aarch64.c | 18 +++ > 3 files changed, 200 insertions(+), 1 deletion(-) > create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S > > diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile > index 02fb51c3ab..216191640c 100644 > --- a/libavcodec/aarch64/Makefile > +++ b/libavcodec/aarch64/Makefile > @@ -65,7 +65,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ > aarch64/vp9lpf_neon.o \ > aarch64/vp9mc_16bpp_neon.o \ > aarch64/vp9mc_neon.o > -NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \ > +NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_deblock_neon.o \ > + aarch64/hevcdsp_idct_neon.o \ > aarch64/hevcdsp_init_aarch64.o \ > aarch64/hevcdsp_qpel_neon.o \ > aarch64/hevcdsp_sao_neon.o > diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S > new file mode 100644 > index 0000000000..49b40f21c8 > --- /dev/null > +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S > @@ -0,0 +1,180 @@ > +/* -*-arm64-*- > + * vim: syntax=arm64asm > + * > + * Copyright (c) 2014 Seppo Tomperi > + * Copyright (c) 2023 J. Dekker > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > + > +#include "libavutil/aarch64/asm.S" > +#include "neon.S" > + > +.macro hevc_loop_filter_chroma_start bitdepth > + mov x4, x30 > + ldr w14, [x2] > + ldr w15, [x2, #4] > +.if \bitdepth > 8 > + lsl w14, w14, #(\bitdepth - 8) > + lsl w15, w15, #(\bitdepth - 8) > +.endif > + adds w2, w14, w15 > + b.eq 1f > + dup v16.4h, w14 > + dup v17.4h, w15 > + trn1 v16.2d, v16.2d, v17.2d > +.if \bitdepth > 8 > + mvni v19.8h, #((0xff << (\bitdepth - 8)) & 0xff), lsl #8 > + movi v18.8h, #0 > +.endif > + neg v17.8h, v16.8h > +.endm > + > +.macro hevc_loop_filter_chroma_body bitdepth > +.if \bitdepth <= 8 > + uxtl v0.8h, v0.8b // p1 > + uxtl v1.8h, v1.8b // p0 > + uxtl v2.8h, v2.8b // q0 > + uxtl v3.8h, v3.8b // q1 > +.endif > + sub v5.8h, v2.8h, v1.8h // q0 - p0 > + sub v6.8h, v0.8h, v3.8h // p1 - q1 > + shl v5.8h, v5.8h, #2 > + add v5.8h, v6.8h, v5.8h > + srshr v5.8h, v5.8h, #3 > + clip v17.8h, v16.8h, v5.8h > + sqadd v1.8h, v1.8h, v5.8h // p0 + delta > + sqsub v2.8h, v2.8h, v5.8h // q0 - delta For the bitdepth==8 case, I'm wondering if it'd be more straightforward to do like the arm code does, i.e. don't do uxtl at the start, but widen with usubl at the "q0-p0" stage, and then add with uaddw at the end. That makes the function less templateable though. Would you mind giving that a try just to get benchmark numbers for it? If there's not a big difference I guess this more templated form is fine too. If you try this, just comment out the more complex function that does transposes. (We can simplify that one a little if we don't need to reverse the uxtl.) 
> +.if \bitdepth <= 8 > + sqxtun v1.8b, v1.8h > + sqxtun v2.8b, v2.8h > +.else > + smin v1.8h, v1.8h, v19.8h > + smin v2.8h, v2.8h, v19.8h > + clip v18.8h, v19.8h, v1.8h, v2.8h Don't you do duplicate clipping here - the smin instructions are redundant with the clip macro? > +.endif > +.endm > + > +function hevc_loop_filter_chroma_body_8_neon, export=0 > + hevc_loop_filter_chroma_body 8 > + ret > +endfunc > + > +function hevc_loop_filter_chroma_body_10_neon, export=0 > +hevc_loop_filter_chroma_body_12_neon: > + hevc_loop_filter_chroma_body 10 > + ret > +endfunc > + > +// void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); > + > +.macro hevc_h_loop_filter_chroma bitdepth > +function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1 > + hevc_loop_filter_chroma_start \bitdepth > + sub x0, x0, x1, lsl #1 > +.if \bitdepth > 8 > + ld1 {v0.8h}, [x0], x1 > + ld1 {v1.8h}, [x0], x1 > + ld1 {v2.8h}, [x0], x1 > + ld1 {v3.8h}, [x0] > +.else > + ld1 {v0.8b}, [x0], x1 > + ld1 {v1.8b}, [x0], x1 > + ld1 {v2.8b}, [x0], x1 > + ld1 {v3.8b}, [x0] > +.endif > + sub x0, x0, x1, lsl #1 > + bl hevc_loop_filter_chroma_body_\bitdepth\()_neon > +.if \bitdepth > 8 > + st1 {v1.8h}, [x0], x1 > + st1 {v2.8h}, [x0] > +.else > + st1 {v1.8b}, [x0], x1 > + st1 {v2.8b}, [x0] > +.endif > +1: ret x4 > +endfunc > +.endm > + > +.macro hevc_v_loop_filter_chroma bitdepth > +function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1 > + hevc_loop_filter_chroma_start \bitdepth > +.if \bitdepth > 8 > + sub x0, x0, #8 > + add x3, x0, x1 > + lsl x1, x1, #1 > + ld1 {v20.8h}, [x0], x1 > + ld1 {v21.8h}, [x3], x1 > + ld1 {v0.8h}, [x0], x1 > + ld1 {v1.8h}, [x3], x1 > + ld1 {v2.8h}, [x0], x1 > + ld1 {v3.8h}, [x3], x1 > + ld1 {v22.8h}, [x0], x1 > + ld1 {v23.8h}, [x3], x1 > + transpose_8x8H v20, v21, v0, v1, v2, v3, v22, v23, v24, v25 Reading, transposing and writing back all of 8x8 pixels here, when we really just want a 4x8 slice of pixels, 
is a bit excessive. I see that the existing 32-bit assembly does it like that though, so I guess it's acceptable - but I'd like to leave a remark that it could be done more efficiently. I believe this should be doable by loading the first 4 rows into v0-v3 .d[0] for 10/12 bpp and .s[0] for 8 bpp, the last 4 rows into .d[1] or .s[1], then doing a transpose_4x8H or 4x8B on it, then the same in reverse at the end. // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".