From: Shreesh Adiga <16567adigashreesh@gmail.com>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Subject: Re: [FFmpeg-devel] [PATCH v3] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
Date: Wed, 29 Jan 2025 18:33:59 +0530
Message-ID: <CA+-x59bct+fUA-yXg14fFMs31YqT47R0OUNvsvrJgpmxZAJ5PQ@mail.gmail.com> (raw)
In-Reply-To: <DU0P250MB074761C12B0192B5718A493B8FEE2@DU0P250MB0747.EURP250.PROD.OUTLOOK.COM>
Hi Andreas,
I am not sure if that is needed. I can add the data observed on my machine
(AMD 7950x Zen 4),
I think this will vary from machine to machine. It is expected to be around
2x
compared to AVX2 and there is no core change apart from processing the
scalar loop with masked instructions.
The data doesn't entirely look consistent as per my expectations.
All the shuffle variants are equivalent in the work they do, yet the
speedups
are not consistent as per the report.
shuffle_bytes_0321_c: 56.5 ( 1.00x)
shuffle_bytes_0321_ssse3: 15.2 ( 3.70x)
shuffle_bytes_0321_avx2: 10.2 ( 5.51x)
shuffle_bytes_0321_avx512icl: 9.2 ( 6.11x)
shuffle_bytes_1230_c: 84.5 ( 1.00x)
shuffle_bytes_1230_ssse3: 14.2 ( 5.93x)
shuffle_bytes_1230_avx2: 15.2 ( 5.54x)
shuffle_bytes_1230_avx512icl: 11.2 ( 7.51x)
shuffle_bytes_2103_c: 48.5 ( 1.00x)
shuffle_bytes_2103_ssse3: 21.2 ( 2.28x)
shuffle_bytes_2103_avx2: 13.8 ( 3.53x)
shuffle_bytes_2103_avx512icl: 9.2 ( 5.24x)
shuffle_bytes_3012_c: 84.5 ( 1.00x)
shuffle_bytes_3012_ssse3: 14.2 ( 5.93x)
shuffle_bytes_3012_avx2: 16.2 ( 5.20x)
shuffle_bytes_3012_avx512icl: 10.2 ( 8.24x)
shuffle_bytes_3210_c: 89.2 ( 1.00x)
shuffle_bytes_3210_ssse3: 24.2 ( 3.68x)
shuffle_bytes_3210_avx2: 16.2 ( 5.49x)
shuffle_bytes_3210_avx512icl: 9.2 ( 9.65x)
I can add the details to commit message if you can confirm if it is needed.
Thanks,
Shreesh
On Wed, Jan 29, 2025 at 5:46 PM Andreas Rheinhardt <
andreas.rheinhardt@outlook.com> wrote:
> Shreesh Adiga:
> > Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> > ---
> > v3: Fix build failure on older nasm by replacing "kmovw k, tmpw"
> > with "kmov k, tmpd" which matches "kmovw k, r32" syntax.
> > v2: Tried to align operands and improve indentation for ASM routine.
> > libswscale/x86/rgb2rgb.c | 21 +++++++++
> > libswscale/x86/rgb_2_rgb.asm | 90 +++++++++++++++++++++++-------------
> > 2 files changed, 80 insertions(+), 31 deletions(-)
> >
> > diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> > index 6790551a38..4cbed54b35 100644
> > --- a/libswscale/x86/rgb2rgb.c
> > +++ b/libswscale/x86/rgb2rgb.c
> > @@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t
> *src, uint8_t *dst, int src_size);
> > void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int
> src_size);
> > void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int
> src_size);
> >
> > +void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +
> > void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> > const uint8_t *src, int width, int height,
> > int lumStride, int chromStride, int
> srcStride);
> > @@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void)
> > shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
> > shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
> > }
> > + if (EXTERNAL_AVX512ICL(cpu_flags)) {
> > + shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
> > + shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
> > + shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
> > + shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
> > + shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
> > + shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
> > + shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
> > + shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
> > + shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
> > + }
> > if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> > uyvytoyuv422 = ff_uyvytoyuv422_avx2;
> > #endif
> > diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> > index b468beb12d..ca7a481255 100644
> > --- a/libswscale/x86/rgb_2_rgb.asm
> > +++ b/libswscale/x86/rgb_2_rgb.asm
> > @@ -57,40 +57,53 @@ SECTION .text
> > %macro SHUFFLE_BYTES 4
> > cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
> > VBROADCASTI128 m0, [pb_shuffle%1%2%3%4]
> > - movsxdifnidn wq, wd
> > - mov xq, wq
> > -
> > - add srcq, wq
> > - add dstq, wq
> > - neg wq
> > -
> > -;calc scalar loop
> > + movsxdifnidn wq, wd
> > + mov xq, wq
> > +
> > + add srcq, wq
> > + add dstq, wq
> > + neg wq
> > +
> > +%if mmsize == 64
> > + and xq, mmsize - 4
> > + shr xq, 2
> > + mov tmpd, -1
> > + shlx tmpd, tmpd, xd
> > + not tmpd
> > + kmovw k7, tmpd
> > + vmovdqu32 m1{k7}{z}, [srcq + wq]
> > + pshufb m1, m0
> > + vmovdqu32 [dstq + wq]{k7}, m1
> > + lea wq, [wq + 4 * xq]
> > +%else
> > + ;calc scalar loop
> > and xq, mmsize-4
> > je .loop_simd
> >
> > -.loop_scalar:
> > - mov tmpb, [srcq + wq + %1]
> > - mov [dstq+wq + 0], tmpb
> > - mov tmpb, [srcq + wq + %2]
> > - mov [dstq+wq + 1], tmpb
> > - mov tmpb, [srcq + wq + %3]
> > - mov [dstq+wq + 2], tmpb
> > - mov tmpb, [srcq + wq + %4]
> > - mov [dstq+wq + 3], tmpb
> > - add wq, 4
> > - sub xq, 4
> > - jg .loop_scalar
> > -
> > -;check if src_size < mmsize
> > -cmp wq, 0
> > -jge .end
> > -
> > -.loop_simd:
> > - movu m1, [srcq+wq]
> > - pshufb m1, m0
> > - movu [dstq+wq], m1
> > - add wq, mmsize
> > - jl .loop_simd
> > + .loop_scalar:
> > + mov tmpb, [srcq + wq + %1]
> > + mov [dstq+wq + 0], tmpb
> > + mov tmpb, [srcq + wq + %2]
> > + mov [dstq+wq + 1], tmpb
> > + mov tmpb, [srcq + wq + %3]
> > + mov [dstq+wq + 2], tmpb
> > + mov tmpb, [srcq + wq + %4]
> > + mov [dstq+wq + 3], tmpb
> > + add wq, 4
> > + sub xq, 4
> > + jg .loop_scalar
> > +%endif
> > +
> > + ;check if src_size < mmsize
> > + cmp wq, 0
> > + jge .end
> > +
> > + .loop_simd:
> > + movu m1, [srcq + wq]
> > + pshufb m1, m0
> > + movu [dstq + wq], m1
> > + add wq, mmsize
> > + jl .loop_simd
> >
> > .end:
> > RET
> > @@ -122,6 +135,21 @@ SHUFFLE_BYTES 1, 2, 0, 3
> > %endif
> > %endif
> >
> > +%if ARCH_X86_64
> > +%if HAVE_AVX512ICL_EXTERNAL
> > +INIT_ZMM avx512icl
> > +SHUFFLE_BYTES 2, 1, 0, 3
> > +SHUFFLE_BYTES 0, 3, 2, 1
> > +SHUFFLE_BYTES 1, 2, 3, 0
> > +SHUFFLE_BYTES 3, 0, 1, 2
> > +SHUFFLE_BYTES 3, 2, 1, 0
> > +SHUFFLE_BYTES 3, 1, 0, 2
> > +SHUFFLE_BYTES 2, 0, 1, 3
> > +SHUFFLE_BYTES 2, 1, 3, 0
> > +SHUFFLE_BYTES 1, 2, 0, 3
> > +%endif
> > +%endif
> > +
> >
> ;-----------------------------------------------------------------------------------------------
> > ; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> > ; const uint8_t *src, int width, int height,
> > --
> > 2.45.3
> >
> Why does the commit message not contain speedup numbers?
>
> - Andreas
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-01-29 13:04 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-01-28 15:48 Shreesh Adiga
2025-01-29 12:16 ` Andreas Rheinhardt
2025-01-29 13:03 ` Shreesh Adiga [this message]
2025-02-03 13:20 ` James Almer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CA+-x59bct+fUA-yXg14fFMs31YqT47R0OUNvsvrJgpmxZAJ5PQ@mail.gmail.com \
--to=16567adigashreesh@gmail.com \
--cc=ffmpeg-devel@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git