Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: replace shuffle_bytes_2103_mmxext with an SSE2 version
@ 2024-06-05 20:51 James Almer
  2024-06-06 14:15 ` [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions James Almer
  0 siblings, 1 reply; 10+ messages in thread
From: James Almer @ 2024-06-05 20:51 UTC (permalink / raw)
  To: ffmpeg-devel

shuffle_bytes_2103_c: 46.5
shuffle_bytes_2103_mmxext: 29.3
shuffle_bytes_2103_sse2: 12.5

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libswscale/x86/rgb2rgb.c     |  6 ++----
 libswscale/x86/rgb_2_rgb.asm | 30 +++++++++++-------------------
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 21ccfafe51..912fe431b3 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -116,7 +116,7 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
 
 #endif /* HAVE_INLINE_ASM */
 
-void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
@@ -154,10 +154,8 @@ av_cold void rgb2rgb_init_x86(void)
         rgb2rgb_init_avx();
 #endif /* HAVE_INLINE_ASM */
 
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
-    }
     if (EXTERNAL_SSE2(cpu_flags)) {
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
 #if ARCH_X86_64
         uyvytoyuv422 = ff_uyvytoyuv422_sse2;
 #endif
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 0bf1278718..2d2ac778b7 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -25,7 +25,7 @@
 
 SECTION_RODATA
 
-pb_mask_shuffle2103_mmx times 8 dw 255
+pb_mask_shuffle2103 times 8 dw 255
 pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
 pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
 pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
@@ -50,11 +50,10 @@ SECTION .text
 ;------------------------------------------------------------------------------
 ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
 ;------------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
-    mova   m6, [pb_mask_shuffle2103_mmx]
-    mova   m7, m6
-    psllq  m7, 8
+INIT_XMM sse2
+cglobal shuffle_bytes_2103, 3, 5, 4, src, dst, w, tmp, x
+    mova   m2, [pb_mask_shuffle2103]
+    psllq  m3, m2, 8
 
     movsxdifnidn wq, wd
     mov xq, wq
@@ -86,28 +85,21 @@ jge .end
 
 .loop_simd:
     movu     m0, [srcq+wq]
-    movu     m1, [srcq+wq+8]
-
-    pshufw   m3, m0, 177
-    pshufw   m5, m1, 177
 
-    pand     m0, m7
-    pand     m3, m6
+    pshuflw   m1, m0, 0xb1
+    pshufhw   m1, m1, 0xb1
 
-    pand     m1, m7
-    pand     m5, m6
+    pand     m0, m3
+    pand     m1, m2
 
-    por      m0, m3
-    por      m1, m5
+    por      m0, m1
 
     movu      [dstq+wq], m0
-    movu  [dstq+wq + 8], m1
 
-    add              wq, mmsize*2
+    add              wq, mmsize
     jl .loop_simd
 
 .end:
-    emms
     RET
 
 ;------------------------------------------------------------------------------
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
  2024-06-05 20:51 [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: replace shuffle_bytes_2103_mmxext with an SSE2 version James Almer
@ 2024-06-06 14:15 ` James Almer
  2024-06-06 14:48   ` Andreas Rheinhardt
  0 siblings, 1 reply; 10+ messages in thread
From: James Almer @ 2024-06-06 14:15 UTC (permalink / raw)
  To: ffmpeg-devel

And remove shuffle_bytes_2103_mmxext.

shuffle_bytes_0321_c: 28.1
shuffle_bytes_0321_sse2: 13.6
shuffle_bytes_0321_ssse3: 9.6
shuffle_bytes_0321_avx2: 7.1
shuffle_bytes_1230_c: 52.6
shuffle_bytes_1230_sse2: 12.1
shuffle_bytes_1230_ssse3: 8.6
shuffle_bytes_1230_avx2: 6.6
shuffle_bytes_2103_c: 29.1
shuffle_bytes_2103_mmxext: 29.3 // removed
shuffle_bytes_2103_sse2: 12.5
shuffle_bytes_2103_ssse3: 8.6
shuffle_bytes_2103_avx2: 7.1
shuffle_bytes_3012_c: 52.1
shuffle_bytes_3012_sse2: 12.1
shuffle_bytes_3012_ssse3: 8.6
shuffle_bytes_3012_avx2: 7.1
shuffle_bytes_3210_c: 50.6
shuffle_bytes_3210_sse2: 14.6
shuffle_bytes_3210_ssse3: 8.6
shuffle_bytes_3210_avx2: 7.1

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libswscale/x86/rgb2rgb.c     | 14 ++++--
 libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
 2 files changed, 69 insertions(+), 28 deletions(-)

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 21ccfafe51..9f6c8efc72 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
 
 #endif /* HAVE_INLINE_ASM */
 
-void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
@@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
         rgb2rgb_init_avx();
 #endif /* HAVE_INLINE_ASM */
 
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
-    }
     if (EXTERNAL_SSE2(cpu_flags)) {
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
+        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
+        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
+        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
+        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
 #if ARCH_X86_64
         uyvytoyuv422 = ff_uyvytoyuv422_sse2;
 #endif
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 0bf1278718..9fc1974389 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -25,7 +25,6 @@
 
 SECTION_RODATA
 
-pb_mask_shuffle2103_mmx times 8 dw 255
 pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
 pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
 pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
@@ -50,11 +49,50 @@ SECTION .text
 ;------------------------------------------------------------------------------
 ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
 ;------------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
-    mova   m6, [pb_mask_shuffle2103_mmx]
-    mova   m7, m6
-    psllq  m7, 8
+
+%macro SHUFFLE2103_SSE2 0
+    pshuflw   m1, m0, 0xb1
+    pshufhw   m1, m1, 0xb1
+
+    pand     m0, m3
+    pand     m1, m2
+%endmacro
+
+%macro SHUFFLE0321_SSE2 0
+    pshuflw   m1, m0, 0xb1
+    pshufhw   m1, m1, 0xb1
+
+    pand     m0, m2
+    pand     m1, m3
+%endmacro
+
+%macro SHUFFLE1230_SSE2 0
+    pslld    m1, m0, 24
+    psrld    m0, 8
+%endmacro
+
+%macro SHUFFLE3012_SSE2 0
+    pslld    m1, m0, 8
+    psrld    m0, 24
+%endmacro
+
+%macro SHUFFLE3210_SSE2 0
+    pshuflw   m1, m0, 0xb1
+    pshufhw   m1, m1, 0xb1
+
+    psrlw     m0, m1, 8
+    psllw     m1, 8
+%endmacro
+
+; %1-4 index shuffle
+; %5 load mask
+%macro SHUFFLE_BYTES_SSE2 5
+cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
+%if %5
+    pcmpeqw        m2, m2
+    psllw          m3, m2, 8 ; (word) { 0xff00 } x8
+    psrlw          m2, 8     ; (word) { 0x00ff } x8
+%endif
 
     movsxdifnidn wq, wd
     mov xq, wq
@@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
     je .loop_simd
 
 .loop_scalar:
-   mov          tmpb, [srcq + wq + 2]
+   mov          tmpb, [srcq + wq + %1]
    mov [dstq+wq + 0], tmpb
-   mov          tmpb, [srcq + wq + 1]
+   mov          tmpb, [srcq + wq + %2]
    mov [dstq+wq + 1], tmpb
-   mov          tmpb, [srcq + wq + 0]
+   mov          tmpb, [srcq + wq + %3]
    mov [dstq+wq + 2], tmpb
-   mov          tmpb, [srcq + wq + 3]
+   mov          tmpb, [srcq + wq + %4]
    mov [dstq+wq + 3], tmpb
    add            wq, 4
    sub            xq, 4
@@ -86,29 +124,26 @@ jge .end
 
 .loop_simd:
     movu     m0, [srcq+wq]
-    movu     m1, [srcq+wq+8]
-
-    pshufw   m3, m0, 177
-    pshufw   m5, m1, 177
-
-    pand     m0, m7
-    pand     m3, m6
 
-    pand     m1, m7
-    pand     m5, m6
+    SHUFFLE%1%2%3%4_SSE2
 
-    por      m0, m3
-    por      m1, m5
+    por      m0, m1
 
     movu      [dstq+wq], m0
-    movu  [dstq+wq + 8], m1
 
-    add              wq, mmsize*2
+    add              wq, mmsize
     jl .loop_simd
 
 .end:
-    emms
     RET
+%endmacro
+
+INIT_XMM sse2
+SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
+SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
+SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
+SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
+SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
 
 ;------------------------------------------------------------------------------
 ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
  2024-06-06 14:15 ` [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions James Almer
@ 2024-06-06 14:48   ` Andreas Rheinhardt
  2024-06-06 15:45     ` James Almer
  0 siblings, 1 reply; 10+ messages in thread
From: Andreas Rheinhardt @ 2024-06-06 14:48 UTC (permalink / raw)
  To: ffmpeg-devel

James Almer:
> And remove shuffle_bytes_2103_mmxext.
> 
> shuffle_bytes_0321_c: 28.1
> shuffle_bytes_0321_sse2: 13.6
> shuffle_bytes_0321_ssse3: 9.6
> shuffle_bytes_0321_avx2: 7.1
> shuffle_bytes_1230_c: 52.6
> shuffle_bytes_1230_sse2: 12.1
> shuffle_bytes_1230_ssse3: 8.6
> shuffle_bytes_1230_avx2: 6.6
> shuffle_bytes_2103_c: 29.1
> shuffle_bytes_2103_mmxext: 29.3 // removed
> shuffle_bytes_2103_sse2: 12.5
> shuffle_bytes_2103_ssse3: 8.6
> shuffle_bytes_2103_avx2: 7.1
> shuffle_bytes_3012_c: 52.1
> shuffle_bytes_3012_sse2: 12.1
> shuffle_bytes_3012_ssse3: 8.6
> shuffle_bytes_3012_avx2: 7.1
> shuffle_bytes_3210_c: 50.6
> shuffle_bytes_3210_sse2: 14.6
> shuffle_bytes_3210_ssse3: 8.6
> shuffle_bytes_3210_avx2: 7.1
> 
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libswscale/x86/rgb2rgb.c     | 14 ++++--
>  libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>  2 files changed, 69 insertions(+), 28 deletions(-)
> 
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index 21ccfafe51..9f6c8efc72 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
>  
>  #endif /* HAVE_INLINE_ASM */
>  
> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>          rgb2rgb_init_avx();
>  #endif /* HAVE_INLINE_ASM */
>  
> -    if (EXTERNAL_MMXEXT(cpu_flags)) {
> -        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
> -    }
>      if (EXTERNAL_SSE2(cpu_flags)) {
> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>  #if ARCH_X86_64
>          uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>  #endif
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index 0bf1278718..9fc1974389 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -25,7 +25,6 @@
>  
>  SECTION_RODATA
>  
> -pb_mask_shuffle2103_mmx times 8 dw 255
>  pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
>  pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
>  pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
> @@ -50,11 +49,50 @@ SECTION .text
>  ;------------------------------------------------------------------------------
>  ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
>  ;------------------------------------------------------------------------------
> -INIT_MMX mmxext
> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
> -    mova   m6, [pb_mask_shuffle2103_mmx]
> -    mova   m7, m6
> -    psllq  m7, 8
> +
> +%macro SHUFFLE2103_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    pand     m0, m3
> +    pand     m1, m2
> +%endmacro
> +
> +%macro SHUFFLE0321_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    pand     m0, m2
> +    pand     m1, m3
> +%endmacro
> +
> +%macro SHUFFLE1230_SSE2 0
> +    pslld    m1, m0, 24
> +    psrld    m0, 8
> +%endmacro
> +
> +%macro SHUFFLE3012_SSE2 0
> +    pslld    m1, m0, 8
> +    psrld    m0, 24
> +%endmacro
> +
> +%macro SHUFFLE3210_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    psrlw     m0, m1, 8
> +    psllw     m1, 8
> +%endmacro
> +
> +; %1-4 index shuffle
> +; %5 load mask
> +%macro SHUFFLE_BYTES_SSE2 5
> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
> +%if %5
> +    pcmpeqw        m2, m2
> +    psllw          m3, m2, 8 ; (word) { 0xff00 } x4
> +    psrlw          m2, 8     ; (word) { 0x00ff } x4
> +%endif
>  
>      movsxdifnidn wq, wd
>      mov xq, wq
> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>      je .loop_simd
>  
>  .loop_scalar:
> -   mov          tmpb, [srcq + wq + 2]
> +   mov          tmpb, [srcq + wq + %1]
>     mov [dstq+wq + 0], tmpb
> -   mov          tmpb, [srcq + wq + 1]
> +   mov          tmpb, [srcq + wq + %2]
>     mov [dstq+wq + 1], tmpb
> -   mov          tmpb, [srcq + wq + 0]
> +   mov          tmpb, [srcq + wq + %3]
>     mov [dstq+wq + 2], tmpb
> -   mov          tmpb, [srcq + wq + 3]
> +   mov          tmpb, [srcq + wq + %4]
>     mov [dstq+wq + 3], tmpb
>     add            wq, 4
>     sub            xq, 4
> @@ -86,29 +124,26 @@ jge .end
>  
>  .loop_simd:
>      movu     m0, [srcq+wq]
> -    movu     m1, [srcq+wq+8]
> -
> -    pshufw   m3, m0, 177
> -    pshufw   m5, m1, 177
> -
> -    pand     m0, m7
> -    pand     m3, m6
>  
> -    pand     m1, m7
> -    pand     m5, m6
> +    SHUFFLE%1%2%3%4_SSE2
>  
> -    por      m0, m3
> -    por      m1, m5
> +    por      m0, m1
>  
>      movu      [dstq+wq], m0
> -    movu  [dstq+wq + 8], m1
>  
> -    add              wq, mmsize*2
> +    add              wq, mmsize
>      jl .loop_simd
>  
>  .end:
> -    emms
>      RET
> +%endmacro
> +
> +INIT_XMM sse2
> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>  
>  ;------------------------------------------------------------------------------
>  ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)

How old are the youngest processors with SSE2, but without SSSE3?
According to Wikipedia, nearly 15 years. Which makes me believe that the
SSE2 versions are not worth it (how many of these CPUs will use a new
FFmpeg anyway?).

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
  2024-06-06 14:48   ` Andreas Rheinhardt
@ 2024-06-06 15:45     ` James Almer
  2024-06-08 15:55       ` Andreas Rheinhardt
  0 siblings, 1 reply; 10+ messages in thread
From: James Almer @ 2024-06-06 15:45 UTC (permalink / raw)
  To: ffmpeg-devel

On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote:
> James Almer:
>> And remove shuffle_bytes_2103_mmxext.
>>
>> shuffle_bytes_0321_c: 28.1
>> shuffle_bytes_0321_sse2: 13.6
>> shuffle_bytes_0321_ssse3: 9.6
>> shuffle_bytes_0321_avx2: 7.1
>> shuffle_bytes_1230_c: 52.6
>> shuffle_bytes_1230_sse2: 12.1
>> shuffle_bytes_1230_ssse3: 8.6
>> shuffle_bytes_1230_avx2: 6.6
>> shuffle_bytes_2103_c: 29.1
>> shuffle_bytes_2103_mmxext: 29.3 // removed
>> shuffle_bytes_2103_sse2: 12.5
>> shuffle_bytes_2103_ssse3: 8.6
>> shuffle_bytes_2103_avx2: 7.1
>> shuffle_bytes_3012_c: 52.1
>> shuffle_bytes_3012_sse2: 12.1
>> shuffle_bytes_3012_ssse3: 8.6
>> shuffle_bytes_3012_avx2: 7.1
>> shuffle_bytes_3210_c: 50.6
>> shuffle_bytes_3210_sse2: 14.6
>> shuffle_bytes_3210_ssse3: 8.6
>> shuffle_bytes_3210_avx2: 7.1
>>
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>>   libswscale/x86/rgb2rgb.c     | 14 ++++--
>>   libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>>   2 files changed, 69 insertions(+), 28 deletions(-)
>>
>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>> index 21ccfafe51..9f6c8efc72 100644
>> --- a/libswscale/x86/rgb2rgb.c
>> +++ b/libswscale/x86/rgb2rgb.c
>> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
>>   
>>   #endif /* HAVE_INLINE_ASM */
>>   
>> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>>   void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>>   void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>>   void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>>           rgb2rgb_init_avx();
>>   #endif /* HAVE_INLINE_ASM */
>>   
>> -    if (EXTERNAL_MMXEXT(cpu_flags)) {
>> -        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
>> -    }
>>       if (EXTERNAL_SSE2(cpu_flags)) {
>> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
>> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
>> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
>> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
>> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>>   #if ARCH_X86_64
>>           uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>>   #endif
>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>> index 0bf1278718..9fc1974389 100644
>> --- a/libswscale/x86/rgb_2_rgb.asm
>> +++ b/libswscale/x86/rgb_2_rgb.asm
>> @@ -25,7 +25,6 @@
>>   
>>   SECTION_RODATA
>>   
>> -pb_mask_shuffle2103_mmx times 8 dw 255
>>   pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
>>   pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
>>   pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
>> @@ -50,11 +49,50 @@ SECTION .text
>>   ;------------------------------------------------------------------------------
>>   ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
>>   ;------------------------------------------------------------------------------
>> -INIT_MMX mmxext
>> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>> -    mova   m6, [pb_mask_shuffle2103_mmx]
>> -    mova   m7, m6
>> -    psllq  m7, 8
>> +
>> +%macro SHUFFLE2103_SSE2 0
>> +    pshuflw   m1, m0, 0xb1
>> +    pshufhw   m1, m1, 0xb1
>> +
>> +    pand     m0, m3
>> +    pand     m1, m2
>> +%endmacro
>> +
>> +%macro SHUFFLE0321_SSE2 0
>> +    pshuflw   m1, m0, 0xb1
>> +    pshufhw   m1, m1, 0xb1
>> +
>> +    pand     m0, m2
>> +    pand     m1, m3
>> +%endmacro
>> +
>> +%macro SHUFFLE1230_SSE2 0
>> +    pslld    m1, m0, 24
>> +    psrld    m0, 8
>> +%endmacro
>> +
>> +%macro SHUFFLE3012_SSE2 0
>> +    pslld    m1, m0, 8
>> +    psrld    m0, 24
>> +%endmacro
>> +
>> +%macro SHUFFLE3210_SSE2 0
>> +    pshuflw   m1, m0, 0xb1
>> +    pshufhw   m1, m1, 0xb1
>> +
>> +    psrlw     m0, m1, 8
>> +    psllw     m1, 8
>> +%endmacro
>> +
>> +; %1-4 index shuffle
>> +; %5 load mask
>> +%macro SHUFFLE_BYTES_SSE2 5
>> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
>> +%if %5
>> +    pcmpeqw        m2, m2
>> +    psllw          m3, m2, 8 ; (word) { 0xff00 } x4
>> +    psrlw          m2, 8     ; (word) { 0x00ff } x4
>> +%endif
>>   
>>       movsxdifnidn wq, wd
>>       mov xq, wq
>> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>>       je .loop_simd
>>   
>>   .loop_scalar:
>> -   mov          tmpb, [srcq + wq + 2]
>> +   mov          tmpb, [srcq + wq + %1]
>>      mov [dstq+wq + 0], tmpb
>> -   mov          tmpb, [srcq + wq + 1]
>> +   mov          tmpb, [srcq + wq + %2]
>>      mov [dstq+wq + 1], tmpb
>> -   mov          tmpb, [srcq + wq + 0]
>> +   mov          tmpb, [srcq + wq + %3]
>>      mov [dstq+wq + 2], tmpb
>> -   mov          tmpb, [srcq + wq + 3]
>> +   mov          tmpb, [srcq + wq + %4]
>>      mov [dstq+wq + 3], tmpb
>>      add            wq, 4
>>      sub            xq, 4
>> @@ -86,29 +124,26 @@ jge .end
>>   
>>   .loop_simd:
>>       movu     m0, [srcq+wq]
>> -    movu     m1, [srcq+wq+8]
>> -
>> -    pshufw   m3, m0, 177
>> -    pshufw   m5, m1, 177
>> -
>> -    pand     m0, m7
>> -    pand     m3, m6
>>   
>> -    pand     m1, m7
>> -    pand     m5, m6
>> +    SHUFFLE%1%2%3%4_SSE2
>>   
>> -    por      m0, m3
>> -    por      m1, m5
>> +    por      m0, m1
>>   
>>       movu      [dstq+wq], m0
>> -    movu  [dstq+wq + 8], m1
>>   
>> -    add              wq, mmsize*2
>> +    add              wq, mmsize
>>       jl .loop_simd
>>   
>>   .end:
>> -    emms
>>       RET
>> +%endmacro
>> +
>> +INIT_XMM sse2
>> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
>> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
>> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
>> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
>> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>>   
>>   ;------------------------------------------------------------------------------
>>   ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
> 
> How old are the youngest processors with SSE2, but without SSSE3?

AMD Phenom/K10.

> According to Wikipedia, nearly 15 years. Which makes me believe that the
> SSE2 versions are not worth it (how many of these CPUs will use a new
> FFmpeg anyway?).

Simply using the latest version of a video player that uses ffmpeg is
enough to be able to run the newest code.
It was easy to write and I'm not interested enough to argue, so if you
think it's not worth adding, I can just remove the mmxext version and
skip adding anything.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
  2024-06-06 15:45     ` James Almer
@ 2024-06-08 15:55       ` Andreas Rheinhardt
  2024-06-08 16:21         ` Rémi Denis-Courmont
  2024-06-09 15:36         ` James Almer
  0 siblings, 2 replies; 10+ messages in thread
From: Andreas Rheinhardt @ 2024-06-08 15:55 UTC (permalink / raw)
  To: ffmpeg-devel

James Almer:
> On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote:
>> James Almer:
>>> And remove shuffle_bytes_2103_mmxext.
>>>
>>> shuffle_bytes_0321_c: 28.1
>>> shuffle_bytes_0321_sse2: 13.6
>>> shuffle_bytes_0321_ssse3: 9.6
>>> shuffle_bytes_0321_avx2: 7.1
>>> shuffle_bytes_1230_c: 52.6
>>> shuffle_bytes_1230_sse2: 12.1
>>> shuffle_bytes_1230_ssse3: 8.6
>>> shuffle_bytes_1230_avx2: 6.6
>>> shuffle_bytes_2103_c: 29.1
>>> shuffle_bytes_2103_mmxext: 29.3 // removed
>>> shuffle_bytes_2103_sse2: 12.5
>>> shuffle_bytes_2103_ssse3: 8.6
>>> shuffle_bytes_2103_avx2: 7.1
>>> shuffle_bytes_3012_c: 52.1
>>> shuffle_bytes_3012_sse2: 12.1
>>> shuffle_bytes_3012_ssse3: 8.6
>>> shuffle_bytes_3012_avx2: 7.1
>>> shuffle_bytes_3210_c: 50.6
>>> shuffle_bytes_3210_sse2: 14.6
>>> shuffle_bytes_3210_ssse3: 8.6
>>> shuffle_bytes_3210_avx2: 7.1
>>>
>>> Signed-off-by: James Almer <jamrial@gmail.com>
>>> ---
>>>   libswscale/x86/rgb2rgb.c     | 14 ++++--
>>>   libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>>>   2 files changed, 69 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>>> index 21ccfafe51..9f6c8efc72 100644
>>> --- a/libswscale/x86/rgb2rgb.c
>>> +++ b/libswscale/x86/rgb2rgb.c
>>> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t,
>>> ff_bgr2UVOffset);
>>>     #endif /* HAVE_INLINE_ASM */
>>>   -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t
>>> *dst, int src_size);
>>> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>>   void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>>   void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>>   void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>>>           rgb2rgb_init_avx();
>>>   #endif /* HAVE_INLINE_ASM */
>>>   -    if (EXTERNAL_MMXEXT(cpu_flags)) {
>>> -        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
>>> -    }
>>>       if (EXTERNAL_SSE2(cpu_flags)) {
>>> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
>>> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
>>> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
>>> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
>>> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>>>   #if ARCH_X86_64
>>>           uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>>>   #endif
>>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>>> index 0bf1278718..9fc1974389 100644
>>> --- a/libswscale/x86/rgb_2_rgb.asm
>>> +++ b/libswscale/x86/rgb_2_rgb.asm
>>> @@ -25,7 +25,6 @@
>>>     SECTION_RODATA
>>>   -pb_mask_shuffle2103_mmx times 8 dw 255
>>>   pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13,
>>> 12, 15
>>>   pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15,
>>> 14, 13
>>>   pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14,
>>> 15, 12
>>> @@ -50,11 +49,50 @@ SECTION .text
>>>  
>>> ;------------------------------------------------------------------------------
>>>   ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int
>>> src_size)
>>>  
>>> ;------------------------------------------------------------------------------
>>> -INIT_MMX mmxext
>>> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>>> -    mova   m6, [pb_mask_shuffle2103_mmx]
>>> -    mova   m7, m6
>>> -    psllq  m7, 8
>>> +
>>> +%macro SHUFFLE2103_SSE2 0
>>> +    pshuflw   m1, m0, 0xb1
>>> +    pshufhw   m1, m1, 0xb1
>>> +
>>> +    pand     m0, m3
>>> +    pand     m1, m2
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE0321_SSE2 0
>>> +    pshuflw   m1, m0, 0xb1
>>> +    pshufhw   m1, m1, 0xb1
>>> +
>>> +    pand     m0, m2
>>> +    pand     m1, m3
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE1230_SSE2 0
>>> +    pslld    m1, m0, 24
>>> +    psrld    m0, 8
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE3012_SSE2 0
>>> +    pslld    m1, m0, 8
>>> +    psrld    m0, 24
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE3210_SSE2 0
>>> +    pshuflw   m1, m0, 0xb1
>>> +    pshufhw   m1, m1, 0xb1
>>> +
>>> +    psrlw     m0, m1, 8
>>> +    psllw     m1, 8
>>> +%endmacro
>>> +
>>> +; %1-4 index shuffle
>>> +; %5 load mask
>>> +%macro SHUFFLE_BYTES_SSE2 5
>>> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
>>> +%if %5
>>> +    pcmpeqw        m2, m2
>>> +    psllw          m3, m2, 8 ; (word) { 0xff00 } x4
>>> +    psrlw          m2, 8     ; (word) { 0x00ff } x4
>>> +%endif
>>>         movsxdifnidn wq, wd
>>>       mov xq, wq
>>> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst,
>>> w, tmp, x
>>>       je .loop_simd
>>>     .loop_scalar:
>>> -   mov          tmpb, [srcq + wq + 2]
>>> +   mov          tmpb, [srcq + wq + %1]
>>>      mov [dstq+wq + 0], tmpb
>>> -   mov          tmpb, [srcq + wq + 1]
>>> +   mov          tmpb, [srcq + wq + %2]
>>>      mov [dstq+wq + 1], tmpb
>>> -   mov          tmpb, [srcq + wq + 0]
>>> +   mov          tmpb, [srcq + wq + %3]
>>>      mov [dstq+wq + 2], tmpb
>>> -   mov          tmpb, [srcq + wq + 3]
>>> +   mov          tmpb, [srcq + wq + %4]
>>>      mov [dstq+wq + 3], tmpb
>>>      add            wq, 4
>>>      sub            xq, 4
>>> @@ -86,29 +124,26 @@ jge .end
>>>     .loop_simd:
>>>       movu     m0, [srcq+wq]
>>> -    movu     m1, [srcq+wq+8]
>>> -
>>> -    pshufw   m3, m0, 177
>>> -    pshufw   m5, m1, 177
>>> -
>>> -    pand     m0, m7
>>> -    pand     m3, m6
>>>   -    pand     m1, m7
>>> -    pand     m5, m6
>>> +    SHUFFLE%1%2%3%4_SSE2
>>>   -    por      m0, m3
>>> -    por      m1, m5
>>> +    por      m0, m1
>>>         movu      [dstq+wq], m0
>>> -    movu  [dstq+wq + 8], m1
>>>   -    add              wq, mmsize*2
>>> +    add              wq, mmsize
>>>       jl .loop_simd
>>>     .end:
>>> -    emms
>>>       RET
>>> +%endmacro
>>> +
>>> +INIT_XMM sse2
>>> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
>>> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
>>> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
>>> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
>>> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>>>    
>>> ;------------------------------------------------------------------------------
>>>   ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
>>
>> How old are the youngest processors with SSE2, but without SSSE3?
> 
> AMD Phenom/K10.
> 
>> According to Wikipedia, nearly 15 years. Which makes me believe that the
>> SSE2 versions are not worth it (how many of these CPUs will use a new
>> FFmpeg anyway?).
> 
> Simply by using the latest version of a video player that uses ffmpeg is
> enough to be able to run the newest code.

I asked "how many", not "how".

> It was easy to write and i don't feel particularly interested enough to
> argue, so if you think it's not worth adding, i can just remove the
> mmxext version and skip adding anything.

I think we should not optimize for CPUs that do not even have x86-64 v2.
So I would not add these SSE2 versions. But the one missing SSSE3
version (shuffle_bytes_2103_ssse3) is of course worth it.

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
  2024-06-08 15:55       ` Andreas Rheinhardt
@ 2024-06-08 16:21         ` Rémi Denis-Courmont
  2024-06-09 15:36         ` James Almer
  1 sibling, 0 replies; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-08 16:21 UTC (permalink / raw)
  To: ffmpeg-devel

Le lauantaina 8. kesäkuuta 2024, 18.55.53 EEST Andreas Rheinhardt a écrit :
> I think we should not optimize for CPUs that do not even have x86-64 v2.
> So I would not add these SSE2 versions.

We certainly should consider ditching SSE2 where SSSE3 is available now or in 
the near future. But in this particular case, James seems to be converting 
MMX(EXT) code into SSE2 code, more so than introducing pure new SSE2 code.

It took almost forever to agree to get rid of MMX. I would like to go ahead 
with that, and I like to think that many other people would too. So can we at least 
tolerate porting MMX to SSE2 until we have gotten rid of MMX for good?

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
  2024-06-08 15:55       ` Andreas Rheinhardt
  2024-06-08 16:21         ` Rémi Denis-Courmont
@ 2024-06-09 15:36         ` James Almer
  2024-06-09 16:05           ` Rémi Denis-Courmont
  2024-06-10 17:06           ` James Almer
  1 sibling, 2 replies; 10+ messages in thread
From: James Almer @ 2024-06-09 15:36 UTC (permalink / raw)
  To: ffmpeg-devel

On 6/8/2024 12:55 PM, Andreas Rheinhardt wrote:
> James Almer:
>> On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote:
>>> James Almer:
>>>> And remove shuffle_bytes_2103_mmxext.
>>>>
>>>> shuffle_bytes_0321_c: 28.1
>>>> shuffle_bytes_0321_sse2: 13.6
>>>> shuffle_bytes_0321_ssse3: 9.6
>>>> shuffle_bytes_0321_avx2: 7.1
>>>> shuffle_bytes_1230_c: 52.6
>>>> shuffle_bytes_1230_sse2: 12.1
>>>> shuffle_bytes_1230_ssse3: 8.6
>>>> shuffle_bytes_1230_avx2: 6.6
>>>> shuffle_bytes_2103_c: 29.1
>>>> shuffle_bytes_2103_mmxext: 29.3 // removed
>>>> shuffle_bytes_2103_sse2: 12.5
>>>> shuffle_bytes_2103_ssse3: 8.6
>>>> shuffle_bytes_2103_avx2: 7.1
>>>> shuffle_bytes_3012_c: 52.1
>>>> shuffle_bytes_3012_sse2: 12.1
>>>> shuffle_bytes_3012_ssse3: 8.6
>>>> shuffle_bytes_3012_avx2: 7.1
>>>> shuffle_bytes_3210_c: 50.6
>>>> shuffle_bytes_3210_sse2: 14.6
>>>> shuffle_bytes_3210_ssse3: 8.6
>>>> shuffle_bytes_3210_avx2: 7.1
>>>>
>>>> Signed-off-by: James Almer <jamrial@gmail.com>
>>>> ---
>>>>    libswscale/x86/rgb2rgb.c     | 14 ++++--
>>>>    libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>>>>    2 files changed, 69 insertions(+), 28 deletions(-)
>>>>
>>>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>>>> index 21ccfafe51..9f6c8efc72 100644
>>>> --- a/libswscale/x86/rgb2rgb.c
>>>> +++ b/libswscale/x86/rgb2rgb.c
>>>> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t,
>>>> ff_bgr2UVOffset);
>>>>      #endif /* HAVE_INLINE_ASM */
>>>>    -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t
>>>> *dst, int src_size);
>>>> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>>    void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>>    void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>>    void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>>>>            rgb2rgb_init_avx();
>>>>    #endif /* HAVE_INLINE_ASM */
>>>>    -    if (EXTERNAL_MMXEXT(cpu_flags)) {
>>>> -        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
>>>> -    }
>>>>        if (EXTERNAL_SSE2(cpu_flags)) {
>>>> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
>>>> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
>>>> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
>>>> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
>>>> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>>>>    #if ARCH_X86_64
>>>>            uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>>>>    #endif
>>>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>>>> index 0bf1278718..9fc1974389 100644
>>>> --- a/libswscale/x86/rgb_2_rgb.asm
>>>> +++ b/libswscale/x86/rgb_2_rgb.asm
>>>> @@ -25,7 +25,6 @@
>>>>      SECTION_RODATA
>>>>    -pb_mask_shuffle2103_mmx times 8 dw 255
>>>>    pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13,
>>>> 12, 15
>>>>    pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15,
>>>> 14, 13
>>>>    pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14,
>>>> 15, 12
>>>> @@ -50,11 +49,50 @@ SECTION .text
>>>>   
>>>> ;------------------------------------------------------------------------------
>>>>    ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int
>>>> src_size)
>>>>   
>>>> ;------------------------------------------------------------------------------
>>>> -INIT_MMX mmxext
>>>> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>>>> -    mova   m6, [pb_mask_shuffle2103_mmx]
>>>> -    mova   m7, m6
>>>> -    psllq  m7, 8
>>>> +
>>>> +%macro SHUFFLE2103_SSE2 0
>>>> +    pshuflw   m1, m0, 0xb1
>>>> +    pshufhw   m1, m1, 0xb1
>>>> +
>>>> +    pand     m0, m3
>>>> +    pand     m1, m2
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE0321_SSE2 0
>>>> +    pshuflw   m1, m0, 0xb1
>>>> +    pshufhw   m1, m1, 0xb1
>>>> +
>>>> +    pand     m0, m2
>>>> +    pand     m1, m3
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE1230_SSE2 0
>>>> +    pslld    m1, m0, 24
>>>> +    psrld    m0, 8
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE3012_SSE2 0
>>>> +    pslld    m1, m0, 8
>>>> +    psrld    m0, 24
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE3210_SSE2 0
>>>> +    pshuflw   m1, m0, 0xb1
>>>> +    pshufhw   m1, m1, 0xb1
>>>> +
>>>> +    psrlw     m0, m1, 8
>>>> +    psllw     m1, 8
>>>> +%endmacro
>>>> +
>>>> +; %1-4 index shuffle
>>>> +; %5 load mask
>>>> +%macro SHUFFLE_BYTES_SSE2 5
>>>> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
>>>> +%if %5
>>>> +    pcmpeqw        m2, m2
>>>> +    psllw          m3, m2, 8 ; (word) { 0xff00 } x4
>>>> +    psrlw          m2, 8     ; (word) { 0x00ff } x4
>>>> +%endif
>>>>          movsxdifnidn wq, wd
>>>>        mov xq, wq
>>>> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst,
>>>> w, tmp, x
>>>>        je .loop_simd
>>>>      .loop_scalar:
>>>> -   mov          tmpb, [srcq + wq + 2]
>>>> +   mov          tmpb, [srcq + wq + %1]
>>>>       mov [dstq+wq + 0], tmpb
>>>> -   mov          tmpb, [srcq + wq + 1]
>>>> +   mov          tmpb, [srcq + wq + %2]
>>>>       mov [dstq+wq + 1], tmpb
>>>> -   mov          tmpb, [srcq + wq + 0]
>>>> +   mov          tmpb, [srcq + wq + %3]
>>>>       mov [dstq+wq + 2], tmpb
>>>> -   mov          tmpb, [srcq + wq + 3]
>>>> +   mov          tmpb, [srcq + wq + %4]
>>>>       mov [dstq+wq + 3], tmpb
>>>>       add            wq, 4
>>>>       sub            xq, 4
>>>> @@ -86,29 +124,26 @@ jge .end
>>>>      .loop_simd:
>>>>        movu     m0, [srcq+wq]
>>>> -    movu     m1, [srcq+wq+8]
>>>> -
>>>> -    pshufw   m3, m0, 177
>>>> -    pshufw   m5, m1, 177
>>>> -
>>>> -    pand     m0, m7
>>>> -    pand     m3, m6
>>>>    -    pand     m1, m7
>>>> -    pand     m5, m6
>>>> +    SHUFFLE%1%2%3%4_SSE2
>>>>    -    por      m0, m3
>>>> -    por      m1, m5
>>>> +    por      m0, m1
>>>>          movu      [dstq+wq], m0
>>>> -    movu  [dstq+wq + 8], m1
>>>>    -    add              wq, mmsize*2
>>>> +    add              wq, mmsize
>>>>        jl .loop_simd
>>>>      .end:
>>>> -    emms
>>>>        RET
>>>> +%endmacro
>>>> +
>>>> +INIT_XMM sse2
>>>> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
>>>> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
>>>> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
>>>> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
>>>> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>>>>     
>>>> ;------------------------------------------------------------------------------
>>>>    ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
>>>
>>> How old are the youngest processors with SSE2, but without SSSE3?
>>
>> AMD Phenom/K10.
>>
>>> According to Wikipedia, nearly 15 years. Which makes me believe that the
>>> SSE2 versions are not worth it (how many of these CPUs will use a new
>>> FFmpeg anyway?).
>>
>> Simply by using the latest version of a video player that uses ffmpeg is
>> enough to be able to run the newest code.
> 
> I asked "how many", not "how".

I obviously don't have that kind of information. You'd need to look at 
things like Steam's, Firefox's or Chrome's hardware surveys.

> 
>> It was easy to write and i don't feel particularly interested enough to
>> argue, so if you think it's not worth adding, i can just remove the
>> mmxext version and skip adding anything.
> 
> I think we should not optimize for CPUs that do not even have x86-64 v2.

What is x86-64 v2?

> So I would not add these SSE2 versions. But the one missing SSSE3

Ok, I'll just remove the mmxext one, then.

> version (shuffle_bytes_2103_ssse3) is of course worth it.

I will look into that.

> 
> - Andreas
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
  2024-06-09 15:36         ` James Almer
@ 2024-06-09 16:05           ` Rémi Denis-Courmont
  2024-06-10 17:06           ` James Almer
  1 sibling, 0 replies; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-09 16:05 UTC (permalink / raw)
  To: ffmpeg-devel

Le sunnuntaina 9. kesäkuuta 2024, 18.36.35 EEST James Almer a écrit :
> I obviously don't have that kind of information. You'd need to look at
> things like Steam's, Firefox's or Chrome's hardware surveys.

As discussed on IRC yesterday, Steam claims that 106.85% of processors support
SSE2 (and as many SSE3) but "only" 106.63% support SSSE3 (seriously). What
100% represents, I don't know. AVX2 comes close with 99.83%, but no cigar.

In any case, there is a tiny but observable gap between SSE2 and SSSE3 there.

> >> It was easy to write and i don't feel particularly interested enough to
> >> argue, so if you think it's not worth adding, i can just remove the
> >> mmxext version and skip adding anything.
> > 
> > I think we should not optimize for CPUs that do not even have x86-64 v2.
> 
> What is x86-64 v2?

See
https://developers.redhat.com/blog/2021/01/05/building-red-hat-enterprise-linux-9-for-the-x86-64-v2-microarchitecture-level

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
  2024-06-09 15:36         ` James Almer
  2024-06-09 16:05           ` Rémi Denis-Courmont
@ 2024-06-10 17:06           ` James Almer
  2024-06-11  5:18             ` Andreas Rheinhardt
  1 sibling, 1 reply; 10+ messages in thread
From: James Almer @ 2024-06-10 17:06 UTC (permalink / raw)
  To: ffmpeg-devel

On 6/9/2024 12:36 PM, James Almer wrote:
>> So I would not add these SSE2 versions. But the one missing SSSE3
>> version (shuffle_bytes_2103_ssse3) is of course worth it.
> 
> I will look into that.

I'm not sure why you said it's missing, because it's there.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
  2024-06-10 17:06           ` James Almer
@ 2024-06-11  5:18             ` Andreas Rheinhardt
  0 siblings, 0 replies; 10+ messages in thread
From: Andreas Rheinhardt @ 2024-06-11  5:18 UTC (permalink / raw)
  To: ffmpeg-devel

James Almer:
> On 6/9/2024 12:36 PM, James Almer wrote:
>>> So I would not add these SSE2 versions. But the one missing SSSE3
>>> version (shuffle_bytes_2103_ssse3) is of course worth it.
>>
>> I will look into that.
> 
> I'm not sure why you said it's missing, because it's there.

Sorry for having said garbage.

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-06-11  5:19 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-05 20:51 [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: replace shuffle_bytes_2103_mmxext with an SSE2 version James Almer
2024-06-06 14:15 ` [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions James Almer
2024-06-06 14:48   ` Andreas Rheinhardt
2024-06-06 15:45     ` James Almer
2024-06-08 15:55       ` Andreas Rheinhardt
2024-06-08 16:21         ` Rémi Denis-Courmont
2024-06-09 15:36         ` James Almer
2024-06-09 16:05           ` Rémi Denis-Courmont
2024-06-10 17:06           ` James Almer
2024-06-11  5:18             ` Andreas Rheinhardt

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git