Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: James Almer <jamrial@gmail.com>
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH v3 5/7] lavc/apv: AVX2 transquant for x86-64
Date: Thu, 24 Apr 2025 18:41:17 -0300
Message-ID: <351277d3-387c-4700-afc3-3897a4ca714d@gmail.com> (raw)
In-Reply-To: <f69ef038-92b9-4b0f-9c5a-f1dc2b70ca2e@jkqxz.net>


[-- Attachment #1.1.1: Type: text/plain, Size: 7643 bytes --]

On 4/24/2025 5:37 PM, Mark Thompson wrote:
> On 24/04/2025 03:55, James Almer wrote:
>> On 4/23/2025 5:45 PM, Mark Thompson wrote:
>>> Typical checkasm result on Alder Lake:
>>>
>>> decode_transquant_8_c:                                 464.2 ( 1.00x)
>>> decode_transquant_8_avx2:                               86.2 ( 5.38x)
>>> decode_transquant_10_c:                                481.6 ( 1.00x)
>>> decode_transquant_10_avx2:                              83.5 ( 5.77x)
>>> ---
>>>    libavcodec/apv_dsp.c          |   4 +
>>>    libavcodec/apv_dsp.h          |   2 +
>>>    libavcodec/x86/Makefile       |   2 +
>>>    libavcodec/x86/apv_dsp.asm    | 311 ++++++++++++++++++++++++++++++++++
>>>    libavcodec/x86/apv_dsp_init.c |  44 +++++
>>>    tests/checkasm/Makefile       |   1 +
>>>    tests/checkasm/apv_dsp.c      | 109 ++++++++++++
>>>    tests/checkasm/checkasm.c     |   3 +
>>>    tests/checkasm/checkasm.h     |   1 +
>>>    tests/fate/checkasm.mak       |   1 +
>>>    10 files changed, 478 insertions(+)
>>>    create mode 100644 libavcodec/x86/apv_dsp.asm
>>>    create mode 100644 libavcodec/x86/apv_dsp_init.c
>>>    create mode 100644 tests/checkasm/apv_dsp.c
>>>
>>> ...
>>> diff --git a/libavcodec/x86/apv_dsp.asm b/libavcodec/x86/apv_dsp.asm
>>> new file mode 100644
>>> index 0000000000..12d96481de
>>> --- /dev/null
>>> +++ b/libavcodec/x86/apv_dsp.asm
>>> @@ -0,0 +1,311 @@
>>> +;************************************************************************
>>> +;* This file is part of FFmpeg.
>>> +;*
>>> +;* FFmpeg is free software; you can redistribute it and/or
>>> +;* modify it under the terms of the GNU Lesser General Public
>>> +;* License as published by the Free Software Foundation; either
>>> +;* version 2.1 of the License, or (at your option) any later version.
>>> +;*
>>> +;* FFmpeg is distributed in the hope that it will be useful,
>>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +;* Lesser General Public License for more details.
>>> +;*
>>> +;* You should have received a copy of the GNU Lesser General Public
>>> +;* License along with FFmpeg; if not, write to the Free Software
>>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> +;******************************************************************************
>>> +
>>> +%include "libavutil/x86/x86util.asm"
>>> +
>>> +%if ARCH_X86_64
>>> +
>>> +SECTION_RODATA 32
>>> +
>>> +; Full matrix for row transform.
>>> +const tmatrix_row
>>> +    dw  64,  89,  84,  75,  64,  50,  35,  18
>>> +    dw  64, -18, -84,  50,  64, -75, -35,  89
>>> +    dw  64,  75,  35, -18, -64, -89, -84, -50
>>> +    dw  64, -50, -35,  89, -64, -18,  84, -75
>>> +    dw  64,  50, -35, -89, -64,  18,  84,  75
>>> +    dw  64, -75,  35,  18, -64,  89, -84,  50
>>> +    dw  64,  18, -84, -50,  64,  75, -35, -89
>>> +    dw  64, -89,  84, -75,  64, -50,  35, -18
>>> +
>>> +; Constant pairs for broadcast in column transform.
>>> +const tmatrix_col_even
>>> +    dw  64,  64,  64, -64
>>> +    dw  84,  35,  35, -84
>>> +const tmatrix_col_odd
>>> +    dw  89,  75,  50,  18
>>> +    dw  75, -18, -89, -50
>>> +    dw  50, -89,  18,  75
>>> +    dw  18, -50,  75, -89
>>> +
>>> +; Memory targets for vpbroadcastd (register version requires AVX512).
>>> +cextern pd_1
>>> +const sixtyfour
>>> +    dd  64
>>> +
>>> +SECTION .text
>>> +
>>> +; void ff_apv_decode_transquant_avx2(void *output,
>>> +;                                    ptrdiff_t pitch,
>>> +;                                    const int16_t *input,
>>> +;                                    const int16_t *qmatrix,
>>> +;                                    int bit_depth,
>>> +;                                    int qp_shift);
>>> +
>>> +INIT_YMM avx2
>>> +
>>> +cglobal apv_decode_transquant, 5, 7, 16, output, pitch, input, qmatrix, bit_depth, qp_shift, tmp
>>> +
>>> +    ; Load input and dequantise
>>> +
>>> +    vpbroadcastd  m10, [pd_1]
>>> +    lea       tmpd, [bit_depthd - 2]
>>> +    movd      xm8, qp_shiftm
>>> +    movd      xm9, tmpd
>>> +    vpslld    m10, m10, xm9
>>> +    vpsrld    m10, m10, 1
>>> +
>>> +    ; m8  = scalar qp_shift
>>> +    ; m9  = scalar bd_shift
>>> +    ; m10 = vector 1 << (bd_shift - 1)
>>> +    ; m11 = qmatrix load
>>> +
>>> +%macro LOAD_AND_DEQUANT 2 ; (xmm input, constant offset)
>>> +    vpmovsxwd m%1, [inputq   + %2]
>>> +    vpmovsxwd m11, [qmatrixq + %2]
>>> +    vpmaddwd  m%1, m%1, m11
>>> +    vpslld    m%1, m%1, xm8
>>> +    vpaddd    m%1, m%1, m10
>>> +    vpsrad    m%1, m%1, xm9
>>> +    vpackssdw m%1, m%1, m%1
>>> +%endmacro
>>> +
>>> +    LOAD_AND_DEQUANT 0, 0x00
>>> +    LOAD_AND_DEQUANT 1, 0x10
>>> +    LOAD_AND_DEQUANT 2, 0x20
>>> +    LOAD_AND_DEQUANT 3, 0x30
>>> +    LOAD_AND_DEQUANT 4, 0x40
>>> +    LOAD_AND_DEQUANT 5, 0x50
>>> +    LOAD_AND_DEQUANT 6, 0x60
>>> +    LOAD_AND_DEQUANT 7, 0x70
>>> +
>>> +    ; mN = row N words 0 1 2 3 0 1 2 3 4 5 6 7 4 5 6 7
>>> +
>>> +    ; Transform columns
>>> +    ; This applies a 1-D DCT butterfly
>>> +
>>> +    vpunpcklwd  m12, m0,  m4
>>> +    vpunpcklwd  m13, m2,  m6
>>> +    vpunpcklwd  m14, m1,  m3
>>> +    vpunpcklwd  m15, m5,  m7
>>> +
>>> +    ; m12 = rows 0 and 4 interleaved
>>> +    ; m13 = rows 2 and 6 interleaved
>>> +    ; m14 = rows 1 and 3 interleaved
>>> +    ; m15 = rows 5 and 7 interleaved
>>> +
>>> +    lea         tmpq, [tmatrix_col_even]
>>> +    vpbroadcastd   m0, [tmpq + 0x00]
>>> +    vpbroadcastd   m1, [tmpq + 0x04]
>>> +    vpbroadcastd   m2, [tmpq + 0x08]
>>> +    vpbroadcastd   m3, [tmpq + 0x0c]
>>
>> How about
>>
>>      vbroadcasti128   m0, [tmatrix_col_even]
>>      pshufd   m1, m0, q1111
>>      pshufd   m2, m0, q2222
>>      pshufd   m3, m0, q3333
>>      pshufd   m0, m0, q0000
>>
>> So you remove the lea, and do a single load from memory within a single cross-lane instruction, instead of four of each.
>>
>> Same below.
> 
> The broadcasts from memory are not slow, they don't read from either lane.
> 
> I can't measure a difference but instruction tables have vpbroadcastd as 1/3 and pshufd as 1/2 so I think I'll take that as a tie-break?  (lea is free and they will all load together, the vbroadcasti128 load is unaligned but pretty sure that is irrelevant.)

AVX doesn't care about alignment outside of instructions that are 
explicit about it (so movdqa/movaps). vbroadcasti128 in any case loads 
16 bytes and tmatrix_col_even seems to be 16 byte aligned.

Looking at Skylake and newer, vpbroadcastd has 4 cycle latency and 0.5 
throughput, so by the time the results are stored, the pmaddwd will be 
executed. Meanwhile, vbroadcasti128 has 3 latency, so the pshufd will 
not execute immediately.
I guess your version may be better.


[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  reply	other threads:[~2025-04-24 21:41 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-23 20:45 [FFmpeg-devel] [PATCH v3 0/7] APV support Mark Thompson
2025-04-23 20:45 ` [FFmpeg-devel] [PATCH v3 1/7] lavc: APV codec ID and descriptor Mark Thompson
2025-04-23 20:45 ` [FFmpeg-devel] [PATCH v3 2/7] lavc/cbs: APV support Mark Thompson
2025-04-24  0:02   ` James Almer
2025-04-24 20:16     ` Mark Thompson
2025-04-23 20:45 ` [FFmpeg-devel] [PATCH v3 3/7] lavf: APV demuxer Mark Thompson
2025-04-24  0:10   ` James Almer
2025-04-24 20:15     ` Mark Thompson
2025-04-23 20:45 ` [FFmpeg-devel] [PATCH v3 4/7] lavc: APV decoder Mark Thompson
2025-04-24  3:04   ` James Almer
2025-04-25 17:25   ` Michael Niedermayer
2025-04-23 20:45 ` [FFmpeg-devel] [PATCH v3 5/7] lavc/apv: AVX2 transquant for x86-64 Mark Thompson
2025-04-24  2:55   ` James Almer
2025-04-24 20:37     ` Mark Thompson
2025-04-24 21:41       ` James Almer [this message]
2025-04-23 20:45 ` [FFmpeg-devel] [PATCH v3 6/7] lavc: APV metadata bitstream filter Mark Thompson
2025-04-23 20:45 ` [FFmpeg-devel] [PATCH v3 7/7] lavf: APV muxer Mark Thompson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=351277d3-387c-4700-afc3-3897a4ca714d@gmail.com \
    --to=jamrial@gmail.com \
    --cc=ffmpeg-devel@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git