Re: [FFmpeg-devel] [PATCH 5/6] lavc/apv: AVX2 transquant for x86-64

From: James Almer <jamrial@gmail.com>
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH 5/6] lavc/apv: AVX2 transquant for x86-64
Date: Sat, 19 Apr 2025 18:16:45 -0300
Message-ID: <ca34f4f4-0620-442d-b5b5-b68c7c4e8ee7@gmail.com> (raw)
In-Reply-To: <20250419190712.1265201-6-sw@jkqxz.net>

[-- Attachment #1.1.1: Type: text/plain, Size: 4791 bytes --]

On 4/19/2025 4:07 PM, Mark Thompson wrote:
> diff --git a/libavcodec/x86/apv_dsp.asm b/libavcodec/x86/apv_dsp.asm
> new file mode 100644
> index 0000000000..0329089f45
> --- /dev/null
> +++ b/libavcodec/x86/apv_dsp.asm
> @@ -0,0 +1,243 @@
> +;******************************************************************************
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION .text
> +
> +align 32
> +const tmatrixh

SECTION_RODATA 32

tmatrixh: dw ...
tmatrixv: dw ...

etc. Add only functions to SECTION .text

> +    dw  64,  89,  84,  75,  64,  50,  35,  18
> +    dw  64,  75,  35, -18, -64, -89, -84, -50
> +    dw  64,  50, -35, -89, -64,  18,  84,  75
> +    dw  64,  18, -84, -50,  64,  75, -35, -89
> +    dw  64, -18, -84,  50,  64, -75, -35,  89
> +    dw  64, -50, -35,  89, -64, -18,  84, -75
> +    dw  64, -75,  35,  18, -64,  89, -84,  50
> +    dw  64, -89,  84, -75,  64, -50,  35, -18
> +const tmatrixv
> +    dw  64,  89,  84,  75,  64,  50,  35,  18
> +    dw  64, -18, -84,  50,  64, -75, -35,  89
> +    dw  64,  75,  35, -18, -64, -89, -84, -50
> +    dw  64, -50, -35,  89, -64, -18,  84, -75
> +    dw  64,  50, -35, -89, -64,  18,  84,  75
> +    dw  64, -75,  35,  18, -64,  89, -84,  50
> +    dw  64,  18, -84, -50,  64,  75, -35, -89
> +    dw  64, -89,  84, -75,  64, -50,  35, -18
> +
> +; Memory targets for vpbroadcastd (register version requires AVX512).
> +const one
> +    dd   1

There's pd_1 defined in constants.c, and you can include it here with

cextern pd_1

> +const sixtyfour
> +    dd  64
> +
> +; void ff_apv_decode_transquant_avx2(void *output,
> +;                                    ptrdiff_t pitch,
> +;                                    const int16_t *input,
> +;                                    const int16_t *qmatrix,
> +;                                    int64_t bit_depth,
> +;                                    int64_t qp_shift);
> +
> +INIT_YMM avx2
> +
> +cglobal apv_decode_transquant, 6, 6, 16, output, pitch, input, qmatrix, bit_depth, qp_shift
> +
> +    ; Load input and dequantise
> +
> +    lea       rax, [bit_depthq - 2]

Are you sure you're not overwriting a passed in argument with this? rax 
is different on Unix64, x86_32, and Win64 ABIs. You have qp_shift free 
after the mov to xm8 if you need a tmp register.
In general, you should use the names you gave the registers, or the r$ 
aliases from x86inc.

> +    movq      xm8, qp_shiftq

Both bit_depth and this fit in an int, so unless there's a real reason 
to use int64_t in the prototype, you can change them to int and read 32 
bits from the registers.

> +    movq      xm9, rax
> +    vpbroadcastd  m10, [one]
> +    vpslld    m10, m10, xm9
> +    vpsrld    m10, m10, 1

No need to add the v prefix to pre-AVX instructions. x86inc will do its
magic and emit the VEX-encoded version for them as required.
Similarly, if dst and src1 are the same, you can remove one of them and
x86inc will also handle it, so just do:

        pslld    m10, xm9

And so on. This is important so that x86inc can yell at you if you misuse
an instruction in some cases, and so that the correct register is used
if you use SWAP and other x86inc helpers.

> +
> +    ; m8  = scalar qp_shift
> +    ; m9  = scalar bd_shift
> +    ; m10 = vector 1 << (bd_shift - 1)
> +    ; m11 = qmatrix load
> +%macro LOAD_AND_DEQUANT 2 ; (xmm input, constant offset)
> +    vpmovsxwd m%1, [inputq   + %2]
> +    vpmovsxwd m11, [qmatrixq + %2]
> +    vpmulld   m%1, m%1, m11

Can't you use pmaddwd here, seeing it's 16bit x 16bit -> 32bit? pmulld 
is super slow, like 10 cycles vs 3 or less from every other integer 
multiply instruction.

> +    vpslld    m%1, m%1, xm8
> +    vpaddd    m%1, m%1, m10
> +    vpsrad    m%1, m%1, xm9
> +    vpackssdw m%1, m%1, m%1
> +%endmacro

[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".