* [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: replace shuffle_bytes_2103_mmxext with an SSE2 version
@ 2024-06-05 20:51 James Almer
2024-06-06 14:15 ` [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions James Almer
0 siblings, 1 reply; 10+ messages in thread
From: James Almer @ 2024-06-05 20:51 UTC (permalink / raw)
To: ffmpeg-devel
shuffle_bytes_2103_c: 46.5
shuffle_bytes_2103_mmxext: 29.3
shuffle_bytes_2103_sse2: 12.5
Signed-off-by: James Almer <jamrial@gmail.com>
---
libswscale/x86/rgb2rgb.c | 6 ++----
libswscale/x86/rgb_2_rgb.asm | 30 +++++++++++-------------------
2 files changed, 13 insertions(+), 23 deletions(-)
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 21ccfafe51..912fe431b3 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -116,7 +116,7 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
#endif /* HAVE_INLINE_ASM */
-void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
@@ -154,10 +154,8 @@ av_cold void rgb2rgb_init_x86(void)
rgb2rgb_init_avx();
#endif /* HAVE_INLINE_ASM */
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
- }
if (EXTERNAL_SSE2(cpu_flags)) {
+ shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
#if ARCH_X86_64
uyvytoyuv422 = ff_uyvytoyuv422_sse2;
#endif
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 0bf1278718..2d2ac778b7 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -25,7 +25,7 @@
SECTION_RODATA
-pb_mask_shuffle2103_mmx times 8 dw 255
+pb_mask_shuffle2103 times 8 dw 255
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
@@ -50,11 +50,10 @@ SECTION .text
;------------------------------------------------------------------------------
; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
- mova m6, [pb_mask_shuffle2103_mmx]
- mova m7, m6
- psllq m7, 8
+INIT_XMM sse2
+cglobal shuffle_bytes_2103, 3, 5, 4, src, dst, w, tmp, x
+ mova m2, [pb_mask_shuffle2103]
+ psllq m3, m2, 8
movsxdifnidn wq, wd
mov xq, wq
@@ -86,28 +85,21 @@ jge .end
.loop_simd:
movu m0, [srcq+wq]
- movu m1, [srcq+wq+8]
-
- pshufw m3, m0, 177
- pshufw m5, m1, 177
- pand m0, m7
- pand m3, m6
+ pshuflw m1, m0, 0xb1
+ pshufhw m1, m1, 0xb1
- pand m1, m7
- pand m5, m6
+ pand m0, m3
+ pand m1, m2
- por m0, m3
- por m1, m5
+ por m0, m1
movu [dstq+wq], m0
- movu [dstq+wq + 8], m1
- add wq, mmsize*2
+ add wq, mmsize
jl .loop_simd
.end:
- emms
RET
;------------------------------------------------------------------------------
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
2024-06-05 20:51 [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: replace shuffle_bytes_2103_mmxext with an SSE2 version James Almer
@ 2024-06-06 14:15 ` James Almer
2024-06-06 14:48 ` Andreas Rheinhardt
0 siblings, 1 reply; 10+ messages in thread
From: James Almer @ 2024-06-06 14:15 UTC (permalink / raw)
To: ffmpeg-devel
And remove shuffle_bytes_2103_mmxext.
shuffle_bytes_0321_c: 28.1
shuffle_bytes_0321_sse2: 13.6
shuffle_bytes_0321_ssse3: 9.6
shuffle_bytes_0321_avx2: 7.1
shuffle_bytes_1230_c: 52.6
shuffle_bytes_1230_sse2: 12.1
shuffle_bytes_1230_ssse3: 8.6
shuffle_bytes_1230_avx2: 6.6
shuffle_bytes_2103_c: 29.1
shuffle_bytes_2103_mmxext: 29.3 // removed
shuffle_bytes_2103_sse2: 12.5
shuffle_bytes_2103_ssse3: 8.6
shuffle_bytes_2103_avx2: 7.1
shuffle_bytes_3012_c: 52.1
shuffle_bytes_3012_sse2: 12.1
shuffle_bytes_3012_ssse3: 8.6
shuffle_bytes_3012_avx2: 7.1
shuffle_bytes_3210_c: 50.6
shuffle_bytes_3210_sse2: 14.6
shuffle_bytes_3210_ssse3: 8.6
shuffle_bytes_3210_avx2: 7.1
Signed-off-by: James Almer <jamrial@gmail.com>
---
libswscale/x86/rgb2rgb.c | 14 ++++--
libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
2 files changed, 69 insertions(+), 28 deletions(-)
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 21ccfafe51..9f6c8efc72 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
#endif /* HAVE_INLINE_ASM */
-void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
@@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
rgb2rgb_init_avx();
#endif /* HAVE_INLINE_ASM */
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
- }
if (EXTERNAL_SSE2(cpu_flags)) {
+ shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
+ shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
+ shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
+ shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
+ shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
#if ARCH_X86_64
uyvytoyuv422 = ff_uyvytoyuv422_sse2;
#endif
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 0bf1278718..9fc1974389 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -25,7 +25,6 @@
SECTION_RODATA
-pb_mask_shuffle2103_mmx times 8 dw 255
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
@@ -50,11 +49,50 @@ SECTION .text
;------------------------------------------------------------------------------
; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
- mova m6, [pb_mask_shuffle2103_mmx]
- mova m7, m6
- psllq m7, 8
+
+%macro SHUFFLE2103_SSE2 0
+ pshuflw m1, m0, 0xb1
+ pshufhw m1, m1, 0xb1
+
+ pand m0, m3
+ pand m1, m2
+%endmacro
+
+%macro SHUFFLE0321_SSE2 0
+ pshuflw m1, m0, 0xb1
+ pshufhw m1, m1, 0xb1
+
+ pand m0, m2
+ pand m1, m3
+%endmacro
+
+%macro SHUFFLE1230_SSE2 0
+ pslld m1, m0, 24
+ psrld m0, 8
+%endmacro
+
+%macro SHUFFLE3012_SSE2 0
+ pslld m1, m0, 8
+ psrld m0, 24
+%endmacro
+
+%macro SHUFFLE3210_SSE2 0
+ pshuflw m1, m0, 0xb1
+ pshufhw m1, m1, 0xb1
+
+ psrlw m0, m1, 8
+ psllw m1, 8
+%endmacro
+
+; %1-4 index shuffle
+; %5 load mask
+%macro SHUFFLE_BYTES_SSE2 5
+cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
+%if %5
+ pcmpeqw m2, m2
+ psllw m3, m2, 8 ; (word) { 0xff00 } x4
+ psrlw m2, 8 ; (word) { 0x00ff } x4
+%endif
movsxdifnidn wq, wd
mov xq, wq
@@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
je .loop_simd
.loop_scalar:
- mov tmpb, [srcq + wq + 2]
+ mov tmpb, [srcq + wq + %1]
mov [dstq+wq + 0], tmpb
- mov tmpb, [srcq + wq + 1]
+ mov tmpb, [srcq + wq + %2]
mov [dstq+wq + 1], tmpb
- mov tmpb, [srcq + wq + 0]
+ mov tmpb, [srcq + wq + %3]
mov [dstq+wq + 2], tmpb
- mov tmpb, [srcq + wq + 3]
+ mov tmpb, [srcq + wq + %4]
mov [dstq+wq + 3], tmpb
add wq, 4
sub xq, 4
@@ -86,29 +124,26 @@ jge .end
.loop_simd:
movu m0, [srcq+wq]
- movu m1, [srcq+wq+8]
-
- pshufw m3, m0, 177
- pshufw m5, m1, 177
-
- pand m0, m7
- pand m3, m6
- pand m1, m7
- pand m5, m6
+ SHUFFLE%1%2%3%4_SSE2
- por m0, m3
- por m1, m5
+ por m0, m1
movu [dstq+wq], m0
- movu [dstq+wq + 8], m1
- add wq, mmsize*2
+ add wq, mmsize
jl .loop_simd
.end:
- emms
RET
+%endmacro
+
+INIT_XMM sse2
+SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
+SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
+SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
+SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
+SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
;------------------------------------------------------------------------------
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
2024-06-06 14:15 ` [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions James Almer
@ 2024-06-06 14:48 ` Andreas Rheinhardt
2024-06-06 15:45 ` James Almer
0 siblings, 1 reply; 10+ messages in thread
From: Andreas Rheinhardt @ 2024-06-06 14:48 UTC (permalink / raw)
To: ffmpeg-devel
James Almer:
> And remove shuffle_bytes_2103_mmxext.
>
> shuffle_bytes_0321_c: 28.1
> shuffle_bytes_0321_sse2: 13.6
> shuffle_bytes_0321_ssse3: 9.6
> shuffle_bytes_0321_avx2: 7.1
> shuffle_bytes_1230_c: 52.6
> shuffle_bytes_1230_sse2: 12.1
> shuffle_bytes_1230_ssse3: 8.6
> shuffle_bytes_1230_avx2: 6.6
> shuffle_bytes_2103_c: 29.1
> shuffle_bytes_2103_mmxext: 29.3 // removed
> shuffle_bytes_2103_sse2: 12.5
> shuffle_bytes_2103_ssse3: 8.6
> shuffle_bytes_2103_avx2: 7.1
> shuffle_bytes_3012_c: 52.1
> shuffle_bytes_3012_sse2: 12.1
> shuffle_bytes_3012_ssse3: 8.6
> shuffle_bytes_3012_avx2: 7.1
> shuffle_bytes_3210_c: 50.6
> shuffle_bytes_3210_sse2: 14.6
> shuffle_bytes_3210_ssse3: 8.6
> shuffle_bytes_3210_avx2: 7.1
>
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
> libswscale/x86/rgb2rgb.c | 14 ++++--
> libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
> 2 files changed, 69 insertions(+), 28 deletions(-)
>
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index 21ccfafe51..9f6c8efc72 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
>
> #endif /* HAVE_INLINE_ASM */
>
> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
> rgb2rgb_init_avx();
> #endif /* HAVE_INLINE_ASM */
>
> - if (EXTERNAL_MMXEXT(cpu_flags)) {
> - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
> - }
> if (EXTERNAL_SSE2(cpu_flags)) {
> + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
> + shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
> + shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
> + shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
> + shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
> #if ARCH_X86_64
> uyvytoyuv422 = ff_uyvytoyuv422_sse2;
> #endif
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index 0bf1278718..9fc1974389 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -25,7 +25,6 @@
>
> SECTION_RODATA
>
> -pb_mask_shuffle2103_mmx times 8 dw 255
> pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
> pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
> pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
> @@ -50,11 +49,50 @@ SECTION .text
> ;------------------------------------------------------------------------------
> ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
> ;------------------------------------------------------------------------------
> -INIT_MMX mmxext
> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
> - mova m6, [pb_mask_shuffle2103_mmx]
> - mova m7, m6
> - psllq m7, 8
> +
> +%macro SHUFFLE2103_SSE2 0
> + pshuflw m1, m0, 0xb1
> + pshufhw m1, m1, 0xb1
> +
> + pand m0, m3
> + pand m1, m2
> +%endmacro
> +
> +%macro SHUFFLE0321_SSE2 0
> + pshuflw m1, m0, 0xb1
> + pshufhw m1, m1, 0xb1
> +
> + pand m0, m2
> + pand m1, m3
> +%endmacro
> +
> +%macro SHUFFLE1230_SSE2 0
> + pslld m1, m0, 24
> + psrld m0, 8
> +%endmacro
> +
> +%macro SHUFFLE3012_SSE2 0
> + pslld m1, m0, 8
> + psrld m0, 24
> +%endmacro
> +
> +%macro SHUFFLE3210_SSE2 0
> + pshuflw m1, m0, 0xb1
> + pshufhw m1, m1, 0xb1
> +
> + psrlw m0, m1, 8
> + psllw m1, 8
> +%endmacro
> +
> +; %1-4 index shuffle
> +; %5 load mask
> +%macro SHUFFLE_BYTES_SSE2 5
> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
> +%if %5
> + pcmpeqw m2, m2
> + psllw m3, m2, 8 ; (word) { 0xff00 } x4
> + psrlw m2, 8 ; (word) { 0x00ff } x4
> +%endif
>
> movsxdifnidn wq, wd
> mov xq, wq
> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
> je .loop_simd
>
> .loop_scalar:
> - mov tmpb, [srcq + wq + 2]
> + mov tmpb, [srcq + wq + %1]
> mov [dstq+wq + 0], tmpb
> - mov tmpb, [srcq + wq + 1]
> + mov tmpb, [srcq + wq + %2]
> mov [dstq+wq + 1], tmpb
> - mov tmpb, [srcq + wq + 0]
> + mov tmpb, [srcq + wq + %3]
> mov [dstq+wq + 2], tmpb
> - mov tmpb, [srcq + wq + 3]
> + mov tmpb, [srcq + wq + %4]
> mov [dstq+wq + 3], tmpb
> add wq, 4
> sub xq, 4
> @@ -86,29 +124,26 @@ jge .end
>
> .loop_simd:
> movu m0, [srcq+wq]
> - movu m1, [srcq+wq+8]
> -
> - pshufw m3, m0, 177
> - pshufw m5, m1, 177
> -
> - pand m0, m7
> - pand m3, m6
>
> - pand m1, m7
> - pand m5, m6
> + SHUFFLE%1%2%3%4_SSE2
>
> - por m0, m3
> - por m1, m5
> + por m0, m1
>
> movu [dstq+wq], m0
> - movu [dstq+wq + 8], m1
>
> - add wq, mmsize*2
> + add wq, mmsize
> jl .loop_simd
>
> .end:
> - emms
> RET
> +%endmacro
> +
> +INIT_XMM sse2
> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>
> ;------------------------------------------------------------------------------
> ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
How old are the youngest processors with SSE2, but without SSSE3?
According to Wikipedia, nearly 15 years. Which makes me believe that the
SSE2 versions are not worth it (how many of these CPUs will use a new
FFmpeg anyway?).
- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
2024-06-06 14:48 ` Andreas Rheinhardt
@ 2024-06-06 15:45 ` James Almer
2024-06-08 15:55 ` Andreas Rheinhardt
0 siblings, 1 reply; 10+ messages in thread
From: James Almer @ 2024-06-06 15:45 UTC (permalink / raw)
To: ffmpeg-devel
On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote:
> James Almer:
>> And remove shuffle_bytes_2103_mmxext.
>>
>> shuffle_bytes_0321_c: 28.1
>> shuffle_bytes_0321_sse2: 13.6
>> shuffle_bytes_0321_ssse3: 9.6
>> shuffle_bytes_0321_avx2: 7.1
>> shuffle_bytes_1230_c: 52.6
>> shuffle_bytes_1230_sse2: 12.1
>> shuffle_bytes_1230_ssse3: 8.6
>> shuffle_bytes_1230_avx2: 6.6
>> shuffle_bytes_2103_c: 29.1
>> shuffle_bytes_2103_mmxext: 29.3 // removed
>> shuffle_bytes_2103_sse2: 12.5
>> shuffle_bytes_2103_ssse3: 8.6
>> shuffle_bytes_2103_avx2: 7.1
>> shuffle_bytes_3012_c: 52.1
>> shuffle_bytes_3012_sse2: 12.1
>> shuffle_bytes_3012_ssse3: 8.6
>> shuffle_bytes_3012_avx2: 7.1
>> shuffle_bytes_3210_c: 50.6
>> shuffle_bytes_3210_sse2: 14.6
>> shuffle_bytes_3210_ssse3: 8.6
>> shuffle_bytes_3210_avx2: 7.1
>>
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>> libswscale/x86/rgb2rgb.c | 14 ++++--
>> libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>> 2 files changed, 69 insertions(+), 28 deletions(-)
>>
>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>> index 21ccfafe51..9f6c8efc72 100644
>> --- a/libswscale/x86/rgb2rgb.c
>> +++ b/libswscale/x86/rgb2rgb.c
>> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
>>
>> #endif /* HAVE_INLINE_ASM */
>>
>> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>> void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>> void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>> void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>> rgb2rgb_init_avx();
>> #endif /* HAVE_INLINE_ASM */
>>
>> - if (EXTERNAL_MMXEXT(cpu_flags)) {
>> - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
>> - }
>> if (EXTERNAL_SSE2(cpu_flags)) {
>> + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
>> + shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
>> + shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
>> + shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
>> + shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>> #if ARCH_X86_64
>> uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>> #endif
>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>> index 0bf1278718..9fc1974389 100644
>> --- a/libswscale/x86/rgb_2_rgb.asm
>> +++ b/libswscale/x86/rgb_2_rgb.asm
>> @@ -25,7 +25,6 @@
>>
>> SECTION_RODATA
>>
>> -pb_mask_shuffle2103_mmx times 8 dw 255
>> pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
>> pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
>> pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
>> @@ -50,11 +49,50 @@ SECTION .text
>> ;------------------------------------------------------------------------------
>> ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
>> ;------------------------------------------------------------------------------
>> -INIT_MMX mmxext
>> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>> - mova m6, [pb_mask_shuffle2103_mmx]
>> - mova m7, m6
>> - psllq m7, 8
>> +
>> +%macro SHUFFLE2103_SSE2 0
>> + pshuflw m1, m0, 0xb1
>> + pshufhw m1, m1, 0xb1
>> +
>> + pand m0, m3
>> + pand m1, m2
>> +%endmacro
>> +
>> +%macro SHUFFLE0321_SSE2 0
>> + pshuflw m1, m0, 0xb1
>> + pshufhw m1, m1, 0xb1
>> +
>> + pand m0, m2
>> + pand m1, m3
>> +%endmacro
>> +
>> +%macro SHUFFLE1230_SSE2 0
>> + pslld m1, m0, 24
>> + psrld m0, 8
>> +%endmacro
>> +
>> +%macro SHUFFLE3012_SSE2 0
>> + pslld m1, m0, 8
>> + psrld m0, 24
>> +%endmacro
>> +
>> +%macro SHUFFLE3210_SSE2 0
>> + pshuflw m1, m0, 0xb1
>> + pshufhw m1, m1, 0xb1
>> +
>> + psrlw m0, m1, 8
>> + psllw m1, 8
>> +%endmacro
>> +
>> +; %1-4 index shuffle
>> +; %5 load mask
>> +%macro SHUFFLE_BYTES_SSE2 5
>> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
>> +%if %5
>> + pcmpeqw m2, m2
>> + psllw m3, m2, 8 ; (word) { 0xff00 } x4
>> + psrlw m2, 8 ; (word) { 0x00ff } x4
>> +%endif
>>
>> movsxdifnidn wq, wd
>> mov xq, wq
>> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>> je .loop_simd
>>
>> .loop_scalar:
>> - mov tmpb, [srcq + wq + 2]
>> + mov tmpb, [srcq + wq + %1]
>> mov [dstq+wq + 0], tmpb
>> - mov tmpb, [srcq + wq + 1]
>> + mov tmpb, [srcq + wq + %2]
>> mov [dstq+wq + 1], tmpb
>> - mov tmpb, [srcq + wq + 0]
>> + mov tmpb, [srcq + wq + %3]
>> mov [dstq+wq + 2], tmpb
>> - mov tmpb, [srcq + wq + 3]
>> + mov tmpb, [srcq + wq + %4]
>> mov [dstq+wq + 3], tmpb
>> add wq, 4
>> sub xq, 4
>> @@ -86,29 +124,26 @@ jge .end
>>
>> .loop_simd:
>> movu m0, [srcq+wq]
>> - movu m1, [srcq+wq+8]
>> -
>> - pshufw m3, m0, 177
>> - pshufw m5, m1, 177
>> -
>> - pand m0, m7
>> - pand m3, m6
>>
>> - pand m1, m7
>> - pand m5, m6
>> + SHUFFLE%1%2%3%4_SSE2
>>
>> - por m0, m3
>> - por m1, m5
>> + por m0, m1
>>
>> movu [dstq+wq], m0
>> - movu [dstq+wq + 8], m1
>>
>> - add wq, mmsize*2
>> + add wq, mmsize
>> jl .loop_simd
>>
>> .end:
>> - emms
>> RET
>> +%endmacro
>> +
>> +INIT_XMM sse2
>> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
>> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
>> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
>> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
>> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>>
>> ;------------------------------------------------------------------------------
>> ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
>
> How old are the youngest processors with SSE2, but without SSSE3?
AMD Phenom/K10.
> According to Wikipedia, nearly 15 years. Which makes me believe that the
> SSE2 versions are not worth it (how many of these CPUs will use a new
> FFmpeg anyway?).
Simply by using the latest version of a video player that uses ffmpeg is
enough to be able to run the newest code.
It was easy to write and i don't feel particularly interested enough to
argue, so if you think it's not worth adding, i can just remove the
mmxext version and skip adding anything.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
2024-06-06 15:45 ` James Almer
@ 2024-06-08 15:55 ` Andreas Rheinhardt
2024-06-08 16:21 ` Rémi Denis-Courmont
2024-06-09 15:36 ` James Almer
0 siblings, 2 replies; 10+ messages in thread
From: Andreas Rheinhardt @ 2024-06-08 15:55 UTC (permalink / raw)
To: ffmpeg-devel
James Almer:
> On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote:
>> James Almer:
>>> And remove shuffle_bytes_2103_mmxext.
>>>
>>> shuffle_bytes_0321_c: 28.1
>>> shuffle_bytes_0321_sse2: 13.6
>>> shuffle_bytes_0321_ssse3: 9.6
>>> shuffle_bytes_0321_avx2: 7.1
>>> shuffle_bytes_1230_c: 52.6
>>> shuffle_bytes_1230_sse2: 12.1
>>> shuffle_bytes_1230_ssse3: 8.6
>>> shuffle_bytes_1230_avx2: 6.6
>>> shuffle_bytes_2103_c: 29.1
>>> shuffle_bytes_2103_mmxext: 29.3 // removed
>>> shuffle_bytes_2103_sse2: 12.5
>>> shuffle_bytes_2103_ssse3: 8.6
>>> shuffle_bytes_2103_avx2: 7.1
>>> shuffle_bytes_3012_c: 52.1
>>> shuffle_bytes_3012_sse2: 12.1
>>> shuffle_bytes_3012_ssse3: 8.6
>>> shuffle_bytes_3012_avx2: 7.1
>>> shuffle_bytes_3210_c: 50.6
>>> shuffle_bytes_3210_sse2: 14.6
>>> shuffle_bytes_3210_ssse3: 8.6
>>> shuffle_bytes_3210_avx2: 7.1
>>>
>>> Signed-off-by: James Almer <jamrial@gmail.com>
>>> ---
>>> libswscale/x86/rgb2rgb.c | 14 ++++--
>>> libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>>> 2 files changed, 69 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>>> index 21ccfafe51..9f6c8efc72 100644
>>> --- a/libswscale/x86/rgb2rgb.c
>>> +++ b/libswscale/x86/rgb2rgb.c
>>> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t,
>>> ff_bgr2UVOffset);
>>> #endif /* HAVE_INLINE_ASM */
>>> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t
>>> *dst, int src_size);
>>> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst,
>>> int src_size);
>>> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>>> rgb2rgb_init_avx();
>>> #endif /* HAVE_INLINE_ASM */
>>> - if (EXTERNAL_MMXEXT(cpu_flags)) {
>>> - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
>>> - }
>>> if (EXTERNAL_SSE2(cpu_flags)) {
>>> + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
>>> + shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
>>> + shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
>>> + shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
>>> + shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>>> #if ARCH_X86_64
>>> uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>>> #endif
>>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>>> index 0bf1278718..9fc1974389 100644
>>> --- a/libswscale/x86/rgb_2_rgb.asm
>>> +++ b/libswscale/x86/rgb_2_rgb.asm
>>> @@ -25,7 +25,6 @@
>>> SECTION_RODATA
>>> -pb_mask_shuffle2103_mmx times 8 dw 255
>>> pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13,
>>> 12, 15
>>> pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15,
>>> 14, 13
>>> pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14,
>>> 15, 12
>>> @@ -50,11 +49,50 @@ SECTION .text
>>>
>>> ;------------------------------------------------------------------------------
>>> ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int
>>> src_size)
>>>
>>> ;------------------------------------------------------------------------------
>>> -INIT_MMX mmxext
>>> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>>> - mova m6, [pb_mask_shuffle2103_mmx]
>>> - mova m7, m6
>>> - psllq m7, 8
>>> +
>>> +%macro SHUFFLE2103_SSE2 0
>>> + pshuflw m1, m0, 0xb1
>>> + pshufhw m1, m1, 0xb1
>>> +
>>> + pand m0, m3
>>> + pand m1, m2
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE0321_SSE2 0
>>> + pshuflw m1, m0, 0xb1
>>> + pshufhw m1, m1, 0xb1
>>> +
>>> + pand m0, m2
>>> + pand m1, m3
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE1230_SSE2 0
>>> + pslld m1, m0, 24
>>> + psrld m0, 8
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE3012_SSE2 0
>>> + pslld m1, m0, 8
>>> + psrld m0, 24
>>> +%endmacro
>>> +
>>> +%macro SHUFFLE3210_SSE2 0
>>> + pshuflw m1, m0, 0xb1
>>> + pshufhw m1, m1, 0xb1
>>> +
>>> + psrlw m0, m1, 8
>>> + psllw m1, 8
>>> +%endmacro
>>> +
>>> +; %1-4 index shuffle
>>> +; %5 load mask
>>> +%macro SHUFFLE_BYTES_SSE2 5
>>> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
>>> +%if %5
>>> + pcmpeqw m2, m2
>>> + psllw m3, m2, 8 ; (word) { 0xff00 } x4
>>> + psrlw m2, 8 ; (word) { 0x00ff } x4
>>> +%endif
>>> movsxdifnidn wq, wd
>>> mov xq, wq
>>> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst,
>>> w, tmp, x
>>> je .loop_simd
>>> .loop_scalar:
>>> - mov tmpb, [srcq + wq + 2]
>>> + mov tmpb, [srcq + wq + %1]
>>> mov [dstq+wq + 0], tmpb
>>> - mov tmpb, [srcq + wq + 1]
>>> + mov tmpb, [srcq + wq + %2]
>>> mov [dstq+wq + 1], tmpb
>>> - mov tmpb, [srcq + wq + 0]
>>> + mov tmpb, [srcq + wq + %3]
>>> mov [dstq+wq + 2], tmpb
>>> - mov tmpb, [srcq + wq + 3]
>>> + mov tmpb, [srcq + wq + %4]
>>> mov [dstq+wq + 3], tmpb
>>> add wq, 4
>>> sub xq, 4
>>> @@ -86,29 +124,26 @@ jge .end
>>> .loop_simd:
>>> movu m0, [srcq+wq]
>>> - movu m1, [srcq+wq+8]
>>> -
>>> - pshufw m3, m0, 177
>>> - pshufw m5, m1, 177
>>> -
>>> - pand m0, m7
>>> - pand m3, m6
>>> - pand m1, m7
>>> - pand m5, m6
>>> + SHUFFLE%1%2%3%4_SSE2
>>> - por m0, m3
>>> - por m1, m5
>>> + por m0, m1
>>> movu [dstq+wq], m0
>>> - movu [dstq+wq + 8], m1
>>> - add wq, mmsize*2
>>> + add wq, mmsize
>>> jl .loop_simd
>>> .end:
>>> - emms
>>> RET
>>> +%endmacro
>>> +
>>> +INIT_XMM sse2
>>> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
>>> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
>>> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
>>> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
>>> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>>>
>>> ;------------------------------------------------------------------------------
>>> ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
>>
>> How old are the youngest processors with SSE2, but without SSSE3?
>
> AMD Phenom/K10.
>
>> According to Wikipedia, nearly 15 years. Which makes me believe that the
>> SSE2 versions are not worth it (how many of these CPUs will use a new
>> FFmpeg anyway?).
>
> Simply by using the latest version of a video player that uses ffmpeg is
> enough to be able to run the newest code.
I asked "how many", not "how".
> It was easy to write and i don't feel particularly interested enough to
> argue, so if you think it's not worth adding, i can just remove the
> mmxext version and skip adding anything.
I think we should not optimize for CPUs that do not even have x86-64 v2.
So I would not add these SSE2 versions. But the one missing SSSE3
version (shuffle_bytes_2103_ssse3) is of course worth it.
- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
2024-06-08 15:55 ` Andreas Rheinhardt
@ 2024-06-08 16:21 ` Rémi Denis-Courmont
2024-06-09 15:36 ` James Almer
1 sibling, 0 replies; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-08 16:21 UTC (permalink / raw)
To: ffmpeg-devel
Le lauantaina 8. kesäkuuta 2024, 18.55.53 EEST Andreas Rheinhardt a écrit :
> I think we should not optimize for CPUs that do not even have x86-64 v2.
> So I would not add these SSE2 versions.
We certainly should consider ditching SSE2 where SSSE3 is available now or in
the near future. But in this particular case, James seems to be converting
MMX(EXT) code into SSE2 code, more so than introducing pure new SSE2 code.
It took almost forever to agree to get rid of MMX. I would like to go ahead
with that, and I like to think that many other people would too. So can we at least
tolerate porting MMX to SSE2 until we have gotten rid of MMX for good?
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
2024-06-08 15:55 ` Andreas Rheinhardt
2024-06-08 16:21 ` Rémi Denis-Courmont
@ 2024-06-09 15:36 ` James Almer
2024-06-09 16:05 ` Rémi Denis-Courmont
2024-06-10 17:06 ` James Almer
1 sibling, 2 replies; 10+ messages in thread
From: James Almer @ 2024-06-09 15:36 UTC (permalink / raw)
To: ffmpeg-devel
On 6/8/2024 12:55 PM, Andreas Rheinhardt wrote:
> James Almer:
>> On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote:
>>> James Almer:
>>>> And remove shuffle_bytes_2103_mmxext.
>>>>
>>>> shuffle_bytes_0321_c: 28.1
>>>> shuffle_bytes_0321_sse2: 13.6
>>>> shuffle_bytes_0321_ssse3: 9.6
>>>> shuffle_bytes_0321_avx2: 7.1
>>>> shuffle_bytes_1230_c: 52.6
>>>> shuffle_bytes_1230_sse2: 12.1
>>>> shuffle_bytes_1230_ssse3: 8.6
>>>> shuffle_bytes_1230_avx2: 6.6
>>>> shuffle_bytes_2103_c: 29.1
>>>> shuffle_bytes_2103_mmxext: 29.3 // removed
>>>> shuffle_bytes_2103_sse2: 12.5
>>>> shuffle_bytes_2103_ssse3: 8.6
>>>> shuffle_bytes_2103_avx2: 7.1
>>>> shuffle_bytes_3012_c: 52.1
>>>> shuffle_bytes_3012_sse2: 12.1
>>>> shuffle_bytes_3012_ssse3: 8.6
>>>> shuffle_bytes_3012_avx2: 7.1
>>>> shuffle_bytes_3210_c: 50.6
>>>> shuffle_bytes_3210_sse2: 14.6
>>>> shuffle_bytes_3210_ssse3: 8.6
>>>> shuffle_bytes_3210_avx2: 7.1
>>>>
>>>> Signed-off-by: James Almer <jamrial@gmail.com>
>>>> ---
>>>> libswscale/x86/rgb2rgb.c | 14 ++++--
>>>> libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>>>> 2 files changed, 69 insertions(+), 28 deletions(-)
>>>>
>>>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>>>> index 21ccfafe51..9f6c8efc72 100644
>>>> --- a/libswscale/x86/rgb2rgb.c
>>>> +++ b/libswscale/x86/rgb2rgb.c
>>>> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t,
>>>> ff_bgr2UVOffset);
>>>> #endif /* HAVE_INLINE_ASM */
>>>> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t
>>>> *dst, int src_size);
>>>> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst,
>>>> int src_size);
>>>> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>>>> rgb2rgb_init_avx();
>>>> #endif /* HAVE_INLINE_ASM */
>>>> - if (EXTERNAL_MMXEXT(cpu_flags)) {
>>>> - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
>>>> - }
>>>> if (EXTERNAL_SSE2(cpu_flags)) {
>>>> + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
>>>> + shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
>>>> + shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
>>>> + shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
>>>> + shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>>>> #if ARCH_X86_64
>>>> uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>>>> #endif
>>>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>>>> index 0bf1278718..9fc1974389 100644
>>>> --- a/libswscale/x86/rgb_2_rgb.asm
>>>> +++ b/libswscale/x86/rgb_2_rgb.asm
>>>> @@ -25,7 +25,6 @@
>>>> SECTION_RODATA
>>>> -pb_mask_shuffle2103_mmx times 8 dw 255
>>>> pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13,
>>>> 12, 15
>>>> pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15,
>>>> 14, 13
>>>> pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14,
>>>> 15, 12
>>>> @@ -50,11 +49,50 @@ SECTION .text
>>>>
>>>> ;------------------------------------------------------------------------------
>>>> ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int
>>>> src_size)
>>>>
>>>> ;------------------------------------------------------------------------------
>>>> -INIT_MMX mmxext
>>>> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>>>> - mova m6, [pb_mask_shuffle2103_mmx]
>>>> - mova m7, m6
>>>> - psllq m7, 8
>>>> +
>>>> +%macro SHUFFLE2103_SSE2 0
>>>> + pshuflw m1, m0, 0xb1
>>>> + pshufhw m1, m1, 0xb1
>>>> +
>>>> + pand m0, m3
>>>> + pand m1, m2
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE0321_SSE2 0
>>>> + pshuflw m1, m0, 0xb1
>>>> + pshufhw m1, m1, 0xb1
>>>> +
>>>> + pand m0, m2
>>>> + pand m1, m3
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE1230_SSE2 0
>>>> + pslld m1, m0, 24
>>>> + psrld m0, 8
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE3012_SSE2 0
>>>> + pslld m1, m0, 8
>>>> + psrld m0, 24
>>>> +%endmacro
>>>> +
>>>> +%macro SHUFFLE3210_SSE2 0
>>>> + pshuflw m1, m0, 0xb1
>>>> + pshufhw m1, m1, 0xb1
>>>> +
>>>> + psrlw m0, m1, 8
>>>> + psllw m1, 8
>>>> +%endmacro
>>>> +
>>>> +; %1-4 index shuffle
>>>> +; %5 load mask
>>>> +%macro SHUFFLE_BYTES_SSE2 5
>>>> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
>>>> +%if %5
>>>> + pcmpeqw m2, m2
>>>> + psllw m3, m2, 8 ; (word) { 0xff00 } x4
>>>> + psrlw m2, 8 ; (word) { 0x00ff } x4
>>>> +%endif
>>>> movsxdifnidn wq, wd
>>>> mov xq, wq
>>>> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst,
>>>> w, tmp, x
>>>> je .loop_simd
>>>> .loop_scalar:
>>>> - mov tmpb, [srcq + wq + 2]
>>>> + mov tmpb, [srcq + wq + %1]
>>>> mov [dstq+wq + 0], tmpb
>>>> - mov tmpb, [srcq + wq + 1]
>>>> + mov tmpb, [srcq + wq + %2]
>>>> mov [dstq+wq + 1], tmpb
>>>> - mov tmpb, [srcq + wq + 0]
>>>> + mov tmpb, [srcq + wq + %3]
>>>> mov [dstq+wq + 2], tmpb
>>>> - mov tmpb, [srcq + wq + 3]
>>>> + mov tmpb, [srcq + wq + %4]
>>>> mov [dstq+wq + 3], tmpb
>>>> add wq, 4
>>>> sub xq, 4
>>>> @@ -86,29 +124,26 @@ jge .end
>>>> .loop_simd:
>>>> movu m0, [srcq+wq]
>>>> - movu m1, [srcq+wq+8]
>>>> -
>>>> - pshufw m3, m0, 177
>>>> - pshufw m5, m1, 177
>>>> -
>>>> - pand m0, m7
>>>> - pand m3, m6
>>>> - pand m1, m7
>>>> - pand m5, m6
>>>> + SHUFFLE%1%2%3%4_SSE2
>>>> - por m0, m3
>>>> - por m1, m5
>>>> + por m0, m1
>>>> movu [dstq+wq], m0
>>>> - movu [dstq+wq + 8], m1
>>>> - add wq, mmsize*2
>>>> + add wq, mmsize
>>>> jl .loop_simd
>>>> .end:
>>>> - emms
>>>> RET
>>>> +%endmacro
>>>> +
>>>> +INIT_XMM sse2
>>>> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
>>>> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
>>>> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
>>>> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
>>>> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>>>>
>>>> ;------------------------------------------------------------------------------
>>>> ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
>>>
>>> How old are the youngest processors with SSE2, but without SSSE3?
>>
>> AMD Phenom/K10.
>>
>>> According to Wikipedia, nearly 15 years. Which makes me believe that the
>>> SSE2 versions are not worth it (how many of these CPUs will use a new
>>> FFmpeg anyway?).
>>
>> Simply by using the latest version of a video player that uses ffmpeg is
>> enough to be able to run the newest code.
>
> I asked "how many", not "how".
I obviously don't have that kind of information. You'd need to look at
things like Steam's, Firefox's or Chrome's hardware surveys.
>
>> It was easy to write and i don't feel particularly interested enough to
>> argue, so if you think it's not worth adding, i can just remove the
>> mmxext version and skip adding anything.
>
> I think we should not optimize for CPUs that do not even have x86-64 v2.
What is x86-64 v2?
> So I would not add these SSE2 versions. But the one missing SSSE3
Ok, I'll just remove the mmxext one, then.
> version (shuffle_bytes_2103_ssse3) is of course worth it.
I will look into that.
>
> - Andreas
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
2024-06-09 15:36 ` James Almer
@ 2024-06-09 16:05 ` Rémi Denis-Courmont
2024-06-10 17:06 ` James Almer
1 sibling, 0 replies; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-09 16:05 UTC (permalink / raw)
To: ffmpeg-devel
Le sunnuntaina 9. kesäkuuta 2024, 18.36.35 EEST James Almer a écrit :
> I obviously don't have that kind of information. You'd need to look at
> things like Steam's, Firefox's or Chrome's hardware surveys.
As discussed on IRC yesterday, Steam claims that 106.85% of processors support
SSE2 (and as many support SSE3) but "only" 106.63% support SSSE3 (seriously). What
the 100% refers to, I don't know. AVX2 is close with 99.83% but no cigar.
In any case, there is a tiny but observable gap between SSE2 and SSSE3 there.
> >> It was easy to write and i don't feel particularly interested enough to
> >> argue, so if you think it's not worth adding, i can just remove the
> >> mmxext version and skip adding anything.
> >
> > I think we should not optimize for CPUs that do not even have x86-64 v2.
>
> What is x86-64 v2?
See
https://developers.redhat.com/blog/2021/01/05/building-red-hat-enterprise-linux-9-for-the-x86-64-v2-microarchitecture-level
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
2024-06-09 15:36 ` James Almer
2024-06-09 16:05 ` Rémi Denis-Courmont
@ 2024-06-10 17:06 ` James Almer
2024-06-11 5:18 ` Andreas Rheinhardt
1 sibling, 1 reply; 10+ messages in thread
From: James Almer @ 2024-06-10 17:06 UTC (permalink / raw)
To: ffmpeg-devel
On 6/9/2024 12:36 PM, James Almer wrote:
>> So I would not add these SSE2 versions. But the one missing SSSE3
>> version (shuffle_bytes_2103_ssse3) is of course worth it.
>
> I will look into that.
I'm not sure why you said it's missing, because it's there.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
2024-06-10 17:06 ` James Almer
@ 2024-06-11 5:18 ` Andreas Rheinhardt
0 siblings, 0 replies; 10+ messages in thread
From: Andreas Rheinhardt @ 2024-06-11 5:18 UTC (permalink / raw)
To: ffmpeg-devel
James Almer:
> On 6/9/2024 12:36 PM, James Almer wrote:
>>> So I would not add these SSE2 versions. But the one missing SSSE3
>>> version (shuffle_bytes_2103_ssse3) is of course worth it.
>>
>> I will look into that.
>
> I'm not sure why you said it's missing, because it's there.
Sorry for having said garbage.
- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2024-06-11 5:19 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-05 20:51 [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: replace shuffle_bytes_2103_mmxext with an SSE2 version James Almer
2024-06-06 14:15 ` [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions James Almer
2024-06-06 14:48 ` Andreas Rheinhardt
2024-06-06 15:45 ` James Almer
2024-06-08 15:55 ` Andreas Rheinhardt
2024-06-08 16:21 ` Rémi Denis-Courmont
2024-06-09 15:36 ` James Almer
2024-06-09 16:05 ` Rémi Denis-Courmont
2024-06-10 17:06 ` James Almer
2024-06-11 5:18 ` Andreas Rheinhardt
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git