Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH v3] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
@ 2025-01-28 15:48 Shreesh Adiga
  2025-01-29 12:16 ` Andreas Rheinhardt
  0 siblings, 1 reply; 4+ messages in thread
From: Shreesh Adiga @ 2025-01-28 15:48 UTC (permalink / raw)
  To: ffmpeg-devel

Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
---
v3: Fix build failure on older nasm by replacing "kmovw k, tmpw"
    with "kmov k, tmpd" which matches "kmovw k, r32" syntax.
v2: Tried to align operands and improve indentation for ASM routine.
 libswscale/x86/rgb2rgb.c     | 21 +++++++++
 libswscale/x86/rgb_2_rgb.asm | 90 +++++++++++++++++++++++-------------
 2 files changed, 80 insertions(+), 31 deletions(-)

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 6790551a38..4cbed54b35 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size);

+void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
+
 void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                           const uint8_t *src, int width, int height,
                           int lumStride, int chromStride, int srcStride);
@@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void)
         shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
         shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
     }
+    if (EXTERNAL_AVX512ICL(cpu_flags)) {
+        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
+        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
+        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
+        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
+        shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
+        shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
+        shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
+        shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
+    }
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         uyvytoyuv422 = ff_uyvytoyuv422_avx2;
 #endif
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index b468beb12d..ca7a481255 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -57,40 +57,53 @@ SECTION .text
 %macro SHUFFLE_BYTES 4
 cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
     VBROADCASTI128    m0, [pb_shuffle%1%2%3%4]
-    movsxdifnidn wq, wd
-    mov xq, wq
-
-    add        srcq, wq
-    add        dstq, wq
-    neg          wq
-
-;calc scalar loop
+    movsxdifnidn      wq, wd
+    mov               xq, wq
+
+    add             srcq, wq
+    add             dstq, wq
+    neg               wq
+
+%if mmsize == 64
+    and                    xq, mmsize - 4
+    shr                    xq, 2
+    mov                  tmpd, -1
+    shlx                 tmpd, tmpd, xd
+    not                  tmpd
+    kmovw                  k7, tmpd
+    vmovdqu32       m1{k7}{z}, [srcq + wq]
+    pshufb                 m1, m0
+    vmovdqu32 [dstq + wq]{k7}, m1
+    lea                    wq, [wq + 4 * xq]
+%else
+    ;calc scalar loop
     and xq, mmsize-4
     je .loop_simd

-.loop_scalar:
-   mov          tmpb, [srcq + wq + %1]
-   mov [dstq+wq + 0], tmpb
-   mov          tmpb, [srcq + wq + %2]
-   mov [dstq+wq + 1], tmpb
-   mov          tmpb, [srcq + wq + %3]
-   mov [dstq+wq + 2], tmpb
-   mov          tmpb, [srcq + wq + %4]
-   mov [dstq+wq + 3], tmpb
-   add            wq, 4
-   sub            xq, 4
-   jg .loop_scalar
-
-;check if src_size < mmsize
-cmp wq, 0
-jge .end
-
-.loop_simd:
-    movu           m1, [srcq+wq]
-    pshufb         m1, m0
-    movu    [dstq+wq], m1
-    add            wq, mmsize
-    jl .loop_simd
+    .loop_scalar:
+        mov          tmpb, [srcq + wq + %1]
+        mov [dstq+wq + 0], tmpb
+        mov          tmpb, [srcq + wq + %2]
+        mov [dstq+wq + 1], tmpb
+        mov          tmpb, [srcq + wq + %3]
+        mov [dstq+wq + 2], tmpb
+        mov          tmpb, [srcq + wq + %4]
+        mov [dstq+wq + 3], tmpb
+        add            wq, 4
+        sub            xq, 4
+        jg .loop_scalar
+%endif
+
+    ;check if src_size < mmsize
+    cmp wq, 0
+    jge .end
+
+    .loop_simd:
+        movu            m1, [srcq + wq]
+        pshufb          m1, m0
+        movu   [dstq + wq], m1
+        add             wq, mmsize
+        jl .loop_simd

 .end:
     RET
@@ -122,6 +135,21 @@ SHUFFLE_BYTES 1, 2, 0, 3
 %endif
 %endif

+%if ARCH_X86_64
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl
+SHUFFLE_BYTES 2, 1, 0, 3
+SHUFFLE_BYTES 0, 3, 2, 1
+SHUFFLE_BYTES 1, 2, 3, 0
+SHUFFLE_BYTES 3, 0, 1, 2
+SHUFFLE_BYTES 3, 2, 1, 0
+SHUFFLE_BYTES 3, 1, 0, 2
+SHUFFLE_BYTES 2, 0, 1, 3
+SHUFFLE_BYTES 2, 1, 3, 0
+SHUFFLE_BYTES 1, 2, 0, 3
+%endif
+%endif
+
 ;-----------------------------------------------------------------------------------------------
 ; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 ;              const uint8_t *src, int width, int height,
--
2.45.3

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
  2025-01-28 15:48 [FFmpeg-devel] [PATCH v3] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes Shreesh Adiga
@ 2025-01-29 12:16 ` Andreas Rheinhardt
  2025-01-29 13:03   ` Shreesh Adiga
  0 siblings, 1 reply; 4+ messages in thread
From: Andreas Rheinhardt @ 2025-01-29 12:16 UTC (permalink / raw)
  To: ffmpeg-devel

Shreesh Adiga:
> Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> ---
> v3: Fix build failure on older nasm by replacing "kmovw k, tmpw"
>     with "kmov k, tmpd" which matches "kmovw k, r32" syntax.
> v2: Tried to align operands and improve indentation for ASM routine.
>  libswscale/x86/rgb2rgb.c     | 21 +++++++++
>  libswscale/x86/rgb_2_rgb.asm | 90 +++++++++++++++++++++++-------------
>  2 files changed, 80 insertions(+), 31 deletions(-)
> 
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index 6790551a38..4cbed54b35 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size);
> 
> +void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
> +
>  void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>                            const uint8_t *src, int width, int height,
>                            int lumStride, int chromStride, int srcStride);
> @@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void)
>          shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
>          shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
>      }
> +    if (EXTERNAL_AVX512ICL(cpu_flags)) {
> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
> +        shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
> +        shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
> +        shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
> +        shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
> +    }
>      if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>          uyvytoyuv422 = ff_uyvytoyuv422_avx2;
>  #endif
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index b468beb12d..ca7a481255 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -57,40 +57,53 @@ SECTION .text
>  %macro SHUFFLE_BYTES 4
>  cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
>      VBROADCASTI128    m0, [pb_shuffle%1%2%3%4]
> -    movsxdifnidn wq, wd
> -    mov xq, wq
> -
> -    add        srcq, wq
> -    add        dstq, wq
> -    neg          wq
> -
> -;calc scalar loop
> +    movsxdifnidn      wq, wd
> +    mov               xq, wq
> +
> +    add             srcq, wq
> +    add             dstq, wq
> +    neg               wq
> +
> +%if mmsize == 64
> +    and                    xq, mmsize - 4
> +    shr                    xq, 2
> +    mov                  tmpd, -1
> +    shlx                 tmpd, tmpd, xd
> +    not                  tmpd
> +    kmovw                  k7, tmpd
> +    vmovdqu32       m1{k7}{z}, [srcq + wq]
> +    pshufb                 m1, m0
> +    vmovdqu32 [dstq + wq]{k7}, m1
> +    lea                    wq, [wq + 4 * xq]
> +%else
> +    ;calc scalar loop
>      and xq, mmsize-4
>      je .loop_simd
> 
> -.loop_scalar:
> -   mov          tmpb, [srcq + wq + %1]
> -   mov [dstq+wq + 0], tmpb
> -   mov          tmpb, [srcq + wq + %2]
> -   mov [dstq+wq + 1], tmpb
> -   mov          tmpb, [srcq + wq + %3]
> -   mov [dstq+wq + 2], tmpb
> -   mov          tmpb, [srcq + wq + %4]
> -   mov [dstq+wq + 3], tmpb
> -   add            wq, 4
> -   sub            xq, 4
> -   jg .loop_scalar
> -
> -;check if src_size < mmsize
> -cmp wq, 0
> -jge .end
> -
> -.loop_simd:
> -    movu           m1, [srcq+wq]
> -    pshufb         m1, m0
> -    movu    [dstq+wq], m1
> -    add            wq, mmsize
> -    jl .loop_simd
> +    .loop_scalar:
> +        mov          tmpb, [srcq + wq + %1]
> +        mov [dstq+wq + 0], tmpb
> +        mov          tmpb, [srcq + wq + %2]
> +        mov [dstq+wq + 1], tmpb
> +        mov          tmpb, [srcq + wq + %3]
> +        mov [dstq+wq + 2], tmpb
> +        mov          tmpb, [srcq + wq + %4]
> +        mov [dstq+wq + 3], tmpb
> +        add            wq, 4
> +        sub            xq, 4
> +        jg .loop_scalar
> +%endif
> +
> +    ;check if src_size < mmsize
> +    cmp wq, 0
> +    jge .end
> +
> +    .loop_simd:
> +        movu            m1, [srcq + wq]
> +        pshufb          m1, m0
> +        movu   [dstq + wq], m1
> +        add             wq, mmsize
> +        jl .loop_simd
> 
>  .end:
>      RET
> @@ -122,6 +135,21 @@ SHUFFLE_BYTES 1, 2, 0, 3
>  %endif
>  %endif
> 
> +%if ARCH_X86_64
> +%if HAVE_AVX512ICL_EXTERNAL
> +INIT_ZMM avx512icl
> +SHUFFLE_BYTES 2, 1, 0, 3
> +SHUFFLE_BYTES 0, 3, 2, 1
> +SHUFFLE_BYTES 1, 2, 3, 0
> +SHUFFLE_BYTES 3, 0, 1, 2
> +SHUFFLE_BYTES 3, 2, 1, 0
> +SHUFFLE_BYTES 3, 1, 0, 2
> +SHUFFLE_BYTES 2, 0, 1, 3
> +SHUFFLE_BYTES 2, 1, 3, 0
> +SHUFFLE_BYTES 1, 2, 0, 3
> +%endif
> +%endif
> +
>  ;-----------------------------------------------------------------------------------------------
>  ; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>  ;              const uint8_t *src, int width, int height,
> --
> 2.45.3
> 
Why does the commit message not contain speedup numbers?

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
  2025-01-29 12:16 ` Andreas Rheinhardt
@ 2025-01-29 13:03   ` Shreesh Adiga
  2025-02-03 13:20     ` James Almer
  0 siblings, 1 reply; 4+ messages in thread
From: Shreesh Adiga @ 2025-01-29 13:03 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Hi Andreas,

I am not sure if that is needed. I can add the data observed on my machine
(AMD 7950x Zen 4),
I think this will vary from machine to machine. It is expected to be around
2x
compared to AVX2 and there is no core change apart from processing the
scalar loop with masked instructions.

The data doesn't entirely look consistent as per my expectations.
All the shuffle variants are equivalent in the work they do, yet the
speedups
are not consistent as per the report.

shuffle_bytes_0321_c:                                   56.5 ( 1.00x)
shuffle_bytes_0321_ssse3:                               15.2 ( 3.70x)
shuffle_bytes_0321_avx2:                                10.2 ( 5.51x)
shuffle_bytes_0321_avx512icl:                            9.2 ( 6.11x)
shuffle_bytes_1230_c:                                   84.5 ( 1.00x)
shuffle_bytes_1230_ssse3:                               14.2 ( 5.93x)
shuffle_bytes_1230_avx2:                                15.2 ( 5.54x)
shuffle_bytes_1230_avx512icl:                           11.2 ( 7.51x)
shuffle_bytes_2103_c:                                   48.5 ( 1.00x)
shuffle_bytes_2103_ssse3:                               21.2 ( 2.28x)
shuffle_bytes_2103_avx2:                                13.8 ( 3.53x)
shuffle_bytes_2103_avx512icl:                            9.2 ( 5.24x)
shuffle_bytes_3012_c:                                   84.5 ( 1.00x)
shuffle_bytes_3012_ssse3:                               14.2 ( 5.93x)
shuffle_bytes_3012_avx2:                                16.2 ( 5.20x)
shuffle_bytes_3012_avx512icl:                           10.2 ( 8.24x)
shuffle_bytes_3210_c:                                   89.2 ( 1.00x)
shuffle_bytes_3210_ssse3:                               24.2 ( 3.68x)
shuffle_bytes_3210_avx2:                                16.2 ( 5.49x)
shuffle_bytes_3210_avx512icl:                            9.2 ( 9.65x)

I can add the details to commit message if you can confirm if it is needed.

Thanks,
Shreesh

On Wed, Jan 29, 2025 at 5:46 PM Andreas Rheinhardt <
andreas.rheinhardt@outlook.com> wrote:

> Shreesh Adiga:
> > Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> > ---
> > v3: Fix build failure on older nasm by replacing "kmovw k, tmpw"
> >     with "kmov k, tmpd" which matches "kmovw k, r32" syntax.
> > v2: Tried to align operands and improve indentation for ASM routine.
> >  libswscale/x86/rgb2rgb.c     | 21 +++++++++
> >  libswscale/x86/rgb_2_rgb.asm | 90 +++++++++++++++++++++++-------------
> >  2 files changed, 80 insertions(+), 31 deletions(-)
> >
> > diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> > index 6790551a38..4cbed54b35 100644
> > --- a/libswscale/x86/rgb2rgb.c
> > +++ b/libswscale/x86/rgb2rgb.c
> > @@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t
> *src, uint8_t *dst, int src_size);
> >  void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int
> src_size);
> >  void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int
> src_size);
> >
> > +void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +
> >  void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> >                            const uint8_t *src, int width, int height,
> >                            int lumStride, int chromStride, int
> srcStride);
> > @@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void)
> >          shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
> >          shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
> >      }
> > +    if (EXTERNAL_AVX512ICL(cpu_flags)) {
> > +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
> > +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
> > +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
> > +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
> > +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
> > +        shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
> > +        shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
> > +        shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
> > +        shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
> > +    }
> >      if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> >          uyvytoyuv422 = ff_uyvytoyuv422_avx2;
> >  #endif
> > diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> > index b468beb12d..ca7a481255 100644
> > --- a/libswscale/x86/rgb_2_rgb.asm
> > +++ b/libswscale/x86/rgb_2_rgb.asm
> > @@ -57,40 +57,53 @@ SECTION .text
> >  %macro SHUFFLE_BYTES 4
> >  cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
> >      VBROADCASTI128    m0, [pb_shuffle%1%2%3%4]
> > -    movsxdifnidn wq, wd
> > -    mov xq, wq
> > -
> > -    add        srcq, wq
> > -    add        dstq, wq
> > -    neg          wq
> > -
> > -;calc scalar loop
> > +    movsxdifnidn      wq, wd
> > +    mov               xq, wq
> > +
> > +    add             srcq, wq
> > +    add             dstq, wq
> > +    neg               wq
> > +
> > +%if mmsize == 64
> > +    and                    xq, mmsize - 4
> > +    shr                    xq, 2
> > +    mov                  tmpd, -1
> > +    shlx                 tmpd, tmpd, xd
> > +    not                  tmpd
> > +    kmovw                  k7, tmpd
> > +    vmovdqu32       m1{k7}{z}, [srcq + wq]
> > +    pshufb                 m1, m0
> > +    vmovdqu32 [dstq + wq]{k7}, m1
> > +    lea                    wq, [wq + 4 * xq]
> > +%else
> > +    ;calc scalar loop
> >      and xq, mmsize-4
> >      je .loop_simd
> >
> > -.loop_scalar:
> > -   mov          tmpb, [srcq + wq + %1]
> > -   mov [dstq+wq + 0], tmpb
> > -   mov          tmpb, [srcq + wq + %2]
> > -   mov [dstq+wq + 1], tmpb
> > -   mov          tmpb, [srcq + wq + %3]
> > -   mov [dstq+wq + 2], tmpb
> > -   mov          tmpb, [srcq + wq + %4]
> > -   mov [dstq+wq + 3], tmpb
> > -   add            wq, 4
> > -   sub            xq, 4
> > -   jg .loop_scalar
> > -
> > -;check if src_size < mmsize
> > -cmp wq, 0
> > -jge .end
> > -
> > -.loop_simd:
> > -    movu           m1, [srcq+wq]
> > -    pshufb         m1, m0
> > -    movu    [dstq+wq], m1
> > -    add            wq, mmsize
> > -    jl .loop_simd
> > +    .loop_scalar:
> > +        mov          tmpb, [srcq + wq + %1]
> > +        mov [dstq+wq + 0], tmpb
> > +        mov          tmpb, [srcq + wq + %2]
> > +        mov [dstq+wq + 1], tmpb
> > +        mov          tmpb, [srcq + wq + %3]
> > +        mov [dstq+wq + 2], tmpb
> > +        mov          tmpb, [srcq + wq + %4]
> > +        mov [dstq+wq + 3], tmpb
> > +        add            wq, 4
> > +        sub            xq, 4
> > +        jg .loop_scalar
> > +%endif
> > +
> > +    ;check if src_size < mmsize
> > +    cmp wq, 0
> > +    jge .end
> > +
> > +    .loop_simd:
> > +        movu            m1, [srcq + wq]
> > +        pshufb          m1, m0
> > +        movu   [dstq + wq], m1
> > +        add             wq, mmsize
> > +        jl .loop_simd
> >
> >  .end:
> >      RET
> > @@ -122,6 +135,21 @@ SHUFFLE_BYTES 1, 2, 0, 3
> >  %endif
> >  %endif
> >
> > +%if ARCH_X86_64
> > +%if HAVE_AVX512ICL_EXTERNAL
> > +INIT_ZMM avx512icl
> > +SHUFFLE_BYTES 2, 1, 0, 3
> > +SHUFFLE_BYTES 0, 3, 2, 1
> > +SHUFFLE_BYTES 1, 2, 3, 0
> > +SHUFFLE_BYTES 3, 0, 1, 2
> > +SHUFFLE_BYTES 3, 2, 1, 0
> > +SHUFFLE_BYTES 3, 1, 0, 2
> > +SHUFFLE_BYTES 2, 0, 1, 3
> > +SHUFFLE_BYTES 2, 1, 3, 0
> > +SHUFFLE_BYTES 1, 2, 0, 3
> > +%endif
> > +%endif
> > +
> >
> ;-----------------------------------------------------------------------------------------------
> >  ; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> >  ;              const uint8_t *src, int width, int height,
> > --
> > 2.45.3
> >
> Why does the commit message not contain speedup numbers?
>
> - Andreas
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
  2025-01-29 13:03   ` Shreesh Adiga
@ 2025-02-03 13:20     ` James Almer
  0 siblings, 0 replies; 4+ messages in thread
From: James Almer @ 2025-02-03 13:20 UTC (permalink / raw)
  To: ffmpeg-devel


[-- Attachment #1.1.1: Type: text/plain, Size: 2191 bytes --]

On 1/29/2025 10:03 AM, Shreesh Adiga wrote:
> Hi Andreas,
> 
> I am not sure if that is needed. I can add the data observed on my machine
> (AMD 7950x Zen 4),
> I think this will vary from machine to machine. It is expected to be around
> 2x
> compared to AVX2 and there is no core change apart from processing the
> scalar loop with masked instructions.
> 
> The data doesn't entirely look consistent as per my expectations.
> All the shuffle variants are equivalent in the work they do, yet the
> speedups
> are not consistent as per the report.
> 
> shuffle_bytes_0321_c:                                   56.5 ( 1.00x)
> shuffle_bytes_0321_ssse3:                               15.2 ( 3.70x)
> shuffle_bytes_0321_avx2:                                10.2 ( 5.51x)
> shuffle_bytes_0321_avx512icl:                            9.2 ( 6.11x)
> shuffle_bytes_1230_c:                                   84.5 ( 1.00x)
> shuffle_bytes_1230_ssse3:                               14.2 ( 5.93x)
> shuffle_bytes_1230_avx2:                                15.2 ( 5.54x)
> shuffle_bytes_1230_avx512icl:                           11.2 ( 7.51x)
> shuffle_bytes_2103_c:                                   48.5 ( 1.00x)
> shuffle_bytes_2103_ssse3:                               21.2 ( 2.28x)
> shuffle_bytes_2103_avx2:                                13.8 ( 3.53x)
> shuffle_bytes_2103_avx512icl:                            9.2 ( 5.24x)
> shuffle_bytes_3012_c:                                   84.5 ( 1.00x)
> shuffle_bytes_3012_ssse3:                               14.2 ( 5.93x)
> shuffle_bytes_3012_avx2:                                16.2 ( 5.20x)
> shuffle_bytes_3012_avx512icl:                           10.2 ( 8.24x)
> shuffle_bytes_3210_c:                                   89.2 ( 1.00x)
> shuffle_bytes_3210_ssse3:                               24.2 ( 3.68x)
> shuffle_bytes_3210_avx2:                                16.2 ( 5.49x)
> shuffle_bytes_3210_avx512icl:                            9.2 ( 9.65x)
> 
> I can add the details to commit message if you can confirm if it is needed.
> 
> Thanks,
> Shreesh

Added the benchmarks and pushed the patch. Thanks.


[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-02-03 13:20 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-01-28 15:48 [FFmpeg-devel] [PATCH v3] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes Shreesh Adiga
2025-01-29 12:16 ` Andreas Rheinhardt
2025-01-29 13:03   ` Shreesh Adiga
2025-02-03 13:20     ` James Almer

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git