* [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: replace shuffle_bytes_2103_mmxext with an SSE2 version @ 2024-06-05 20:51 James Almer 2024-06-06 14:15 ` [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions James Almer 0 siblings, 1 reply; 10+ messages in thread From: James Almer @ 2024-06-05 20:51 UTC (permalink / raw) To: ffmpeg-devel shuffle_bytes_2103_c: 46.5 shuffle_bytes_2103_mmxext: 29.3 shuffle_bytes_2103_sse2: 12.5 Signed-off-by: James Almer <jamrial@gmail.com> --- libswscale/x86/rgb2rgb.c | 6 ++---- libswscale/x86/rgb_2_rgb.asm | 30 +++++++++++------------------- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 21ccfafe51..912fe431b3 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -116,7 +116,7 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset); #endif /* HAVE_INLINE_ASM */ -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size); @@ -154,10 +154,8 @@ av_cold void rgb2rgb_init_x86(void) rgb2rgb_init_avx(); #endif /* HAVE_INLINE_ASM */ - if (EXTERNAL_MMXEXT(cpu_flags)) { - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext; - } if (EXTERNAL_SSE2(cpu_flags)) { + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2; #if ARCH_X86_64 uyvytoyuv422 = ff_uyvytoyuv422_sse2; #endif diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm index 0bf1278718..2d2ac778b7 100644 --- a/libswscale/x86/rgb_2_rgb.asm +++ b/libswscale/x86/rgb_2_rgb.asm @@ -25,7 +25,7 @@ SECTION_RODATA -pb_mask_shuffle2103_mmx times 8 dw 255 +pb_mask_shuffle2103 times 8 dw 255 pb_shuffle2103: db 2, 1, 0, 3, 6, 
5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13 pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 @@ -50,11 +50,10 @@ SECTION .text ;------------------------------------------------------------------------------ ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size) ;------------------------------------------------------------------------------ -INIT_MMX mmxext -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x - mova m6, [pb_mask_shuffle2103_mmx] - mova m7, m6 - psllq m7, 8 +INIT_XMM sse2 +cglobal shuffle_bytes_2103, 3, 5, 4, src, dst, w, tmp, x + mova m2, [pb_mask_shuffle2103] + psllq m3, m2, 8 movsxdifnidn wq, wd mov xq, wq @@ -86,28 +85,21 @@ jge .end .loop_simd: movu m0, [srcq+wq] - movu m1, [srcq+wq+8] - - pshufw m3, m0, 177 - pshufw m5, m1, 177 - pand m0, m7 - pand m3, m6 + pshuflw m1, m0, 0xb1 + pshufhw m1, m1, 0xb1 - pand m1, m7 - pand m5, m6 + pand m0, m3 + pand m1, m2 - por m0, m3 - por m1, m5 + por m0, m1 movu [dstq+wq], m0 - movu [dstq+wq + 8], m1 - add wq, mmsize*2 + add wq, mmsize jl .loop_simd .end: - emms RET ;------------------------------------------------------------------------------ -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions 2024-06-05 20:51 [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: replace shuffle_bytes_2103_mmxext with an SSE2 version James Almer @ 2024-06-06 14:15 ` James Almer 2024-06-06 14:48 ` Andreas Rheinhardt 0 siblings, 1 reply; 10+ messages in thread From: James Almer @ 2024-06-06 14:15 UTC (permalink / raw) To: ffmpeg-devel And remove shuffle_bytes_2103_mmxext. shuffle_bytes_0321_c: 28.1 shuffle_bytes_0321_sse2: 13.6 shuffle_bytes_0321_ssse3: 9.6 shuffle_bytes_0321_avx2: 7.1 shuffle_bytes_1230_c: 52.6 shuffle_bytes_1230_sse2: 12.1 shuffle_bytes_1230_ssse3: 8.6 shuffle_bytes_1230_avx2: 6.6 shuffle_bytes_2103_c: 29.1 shuffle_bytes_2103_mmxext: 29.3 // removed shuffle_bytes_2103_sse2: 12.5 shuffle_bytes_2103_ssse3: 8.6 shuffle_bytes_2103_avx2: 7.1 shuffle_bytes_3012_c: 52.1 shuffle_bytes_3012_sse2: 12.1 shuffle_bytes_3012_ssse3: 8.6 shuffle_bytes_3012_avx2: 7.1 shuffle_bytes_3210_c: 50.6 shuffle_bytes_3210_sse2: 14.6 shuffle_bytes_3210_ssse3: 8.6 shuffle_bytes_3210_avx2: 7.1 Signed-off-by: James Almer <jamrial@gmail.com> --- libswscale/x86/rgb2rgb.c | 14 ++++-- libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++----------- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 21ccfafe51..9f6c8efc72 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset); #endif /* HAVE_INLINE_ASM */ -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_3210_sse2(const 
uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size); @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void) rgb2rgb_init_avx(); #endif /* HAVE_INLINE_ASM */ - if (EXTERNAL_MMXEXT(cpu_flags)) { - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext; - } if (EXTERNAL_SSE2(cpu_flags)) { + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2; + shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2; + shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2; + shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2; + shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2; #if ARCH_X86_64 uyvytoyuv422 = ff_uyvytoyuv422_sse2; #endif diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm index 0bf1278718..9fc1974389 100644 --- a/libswscale/x86/rgb_2_rgb.asm +++ b/libswscale/x86/rgb_2_rgb.asm @@ -25,7 +25,6 @@ SECTION_RODATA -pb_mask_shuffle2103_mmx times 8 dw 255 pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13 pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 @@ -50,11 +49,50 @@ SECTION .text ;------------------------------------------------------------------------------ ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size) ;------------------------------------------------------------------------------ -INIT_MMX mmxext -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x - mova m6, [pb_mask_shuffle2103_mmx] - mova m7, m6 - psllq m7, 8 + +%macro SHUFFLE2103_SSE2 0 + pshuflw m1, m0, 0xb1 + pshufhw m1, m1, 0xb1 + + pand m0, m3 + pand m1, m2 +%endmacro + +%macro SHUFFLE0321_SSE2 0 + pshuflw m1, m0, 0xb1 + pshufhw m1, m1, 0xb1 + + pand m0, m2 + pand m1, m3 +%endmacro + +%macro SHUFFLE1230_SSE2 0 + pslld m1, m0, 24 + 
psrld m0, 8 +%endmacro + +%macro SHUFFLE3012_SSE2 0 + pslld m1, m0, 8 + psrld m0, 24 +%endmacro + +%macro SHUFFLE3210_SSE2 0 + pshuflw m1, m0, 0xb1 + pshufhw m1, m1, 0xb1 + + psrlw m0, m1, 8 + psllw m1, 8 +%endmacro + +; %1-4 index shuffle +; %5 load mask +%macro SHUFFLE_BYTES_SSE2 5 +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x +%if %5 + pcmpeqw m2, m2 + psllw m3, m2, 8 ; (word) { 0xff00 } x4 + psrlw m2, 8 ; (word) { 0x00ff } x4 +%endif movsxdifnidn wq, wd mov xq, wq @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x je .loop_simd .loop_scalar: - mov tmpb, [srcq + wq + 2] + mov tmpb, [srcq + wq + %1] mov [dstq+wq + 0], tmpb - mov tmpb, [srcq + wq + 1] + mov tmpb, [srcq + wq + %2] mov [dstq+wq + 1], tmpb - mov tmpb, [srcq + wq + 0] + mov tmpb, [srcq + wq + %3] mov [dstq+wq + 2], tmpb - mov tmpb, [srcq + wq + 3] + mov tmpb, [srcq + wq + %4] mov [dstq+wq + 3], tmpb add wq, 4 sub xq, 4 @@ -86,29 +124,26 @@ jge .end .loop_simd: movu m0, [srcq+wq] - movu m1, [srcq+wq+8] - - pshufw m3, m0, 177 - pshufw m5, m1, 177 - - pand m0, m7 - pand m3, m6 - pand m1, m7 - pand m5, m6 + SHUFFLE%1%2%3%4_SSE2 - por m0, m3 - por m1, m5 + por m0, m1 movu [dstq+wq], m0 - movu [dstq+wq + 8], m1 - add wq, mmsize*2 + add wq, mmsize jl .loop_simd .end: - emms RET +%endmacro + +INIT_XMM sse2 +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1 +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1 +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0 +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0 +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0 ;------------------------------------------------------------------------------ ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size) -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions 2024-06-06 14:15 ` [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions James Almer @ 2024-06-06 14:48 ` Andreas Rheinhardt 2024-06-06 15:45 ` James Almer 0 siblings, 1 reply; 10+ messages in thread From: Andreas Rheinhardt @ 2024-06-06 14:48 UTC (permalink / raw) To: ffmpeg-devel James Almer: > And remove shuffle_bytes_2103_mmxext. > > shuffle_bytes_0321_c: 28.1 > shuffle_bytes_0321_sse2: 13.6 > shuffle_bytes_0321_ssse3: 9.6 > shuffle_bytes_0321_avx2: 7.1 > shuffle_bytes_1230_c: 52.6 > shuffle_bytes_1230_sse2: 12.1 > shuffle_bytes_1230_ssse3: 8.6 > shuffle_bytes_1230_avx2: 6.6 > shuffle_bytes_2103_c: 29.1 > shuffle_bytes_2103_mmxext: 29.3 // removed > shuffle_bytes_2103_sse2: 12.5 > shuffle_bytes_2103_ssse3: 8.6 > shuffle_bytes_2103_avx2: 7.1 > shuffle_bytes_3012_c: 52.1 > shuffle_bytes_3012_sse2: 12.1 > shuffle_bytes_3012_ssse3: 8.6 > shuffle_bytes_3012_avx2: 7.1 > shuffle_bytes_3210_c: 50.6 > shuffle_bytes_3210_sse2: 14.6 > shuffle_bytes_3210_ssse3: 8.6 > shuffle_bytes_3210_avx2: 7.1 > > Signed-off-by: James Almer <jamrial@gmail.com> > --- > libswscale/x86/rgb2rgb.c | 14 ++++-- > libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++----------- > 2 files changed, 69 insertions(+), 28 deletions(-) > > diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c > index 21ccfafe51..9f6c8efc72 100644 > --- a/libswscale/x86/rgb2rgb.c > +++ b/libswscale/x86/rgb2rgb.c > @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset); > > #endif /* HAVE_INLINE_ASM */ > > -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size); > +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size); > +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size); > +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size); > +void 
ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size); > +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size); > void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size); > void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size); > void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size); > @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void) > rgb2rgb_init_avx(); > #endif /* HAVE_INLINE_ASM */ > > - if (EXTERNAL_MMXEXT(cpu_flags)) { > - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext; > - } > if (EXTERNAL_SSE2(cpu_flags)) { > + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2; > + shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2; > + shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2; > + shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2; > + shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2; > #if ARCH_X86_64 > uyvytoyuv422 = ff_uyvytoyuv422_sse2; > #endif > diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm > index 0bf1278718..9fc1974389 100644 > --- a/libswscale/x86/rgb_2_rgb.asm > +++ b/libswscale/x86/rgb_2_rgb.asm > @@ -25,7 +25,6 @@ > > SECTION_RODATA > > -pb_mask_shuffle2103_mmx times 8 dw 255 > pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 > pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13 > pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 > @@ -50,11 +49,50 @@ SECTION .text > ;------------------------------------------------------------------------------ > ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size) > ;------------------------------------------------------------------------------ > -INIT_MMX mmxext > -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x > - mova m6, [pb_mask_shuffle2103_mmx] > - mova m7, m6 > - psllq m7, 8 > + > +%macro SHUFFLE2103_SSE2 0 > + pshuflw m1, m0, 0xb1 > + pshufhw m1, m1, 0xb1 > + > 
+ pand m0, m3 > + pand m1, m2 > +%endmacro > + > +%macro SHUFFLE0321_SSE2 0 > + pshuflw m1, m0, 0xb1 > + pshufhw m1, m1, 0xb1 > + > + pand m0, m2 > + pand m1, m3 > +%endmacro > + > +%macro SHUFFLE1230_SSE2 0 > + pslld m1, m0, 24 > + psrld m0, 8 > +%endmacro > + > +%macro SHUFFLE3012_SSE2 0 > + pslld m1, m0, 8 > + psrld m0, 24 > +%endmacro > + > +%macro SHUFFLE3210_SSE2 0 > + pshuflw m1, m0, 0xb1 > + pshufhw m1, m1, 0xb1 > + > + psrlw m0, m1, 8 > + psllw m1, 8 > +%endmacro > + > +; %1-4 index shuffle > +; %5 load mask > +%macro SHUFFLE_BYTES_SSE2 5 > +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x > +%if %5 > + pcmpeqw m2, m2 > + psllw m3, m2, 8 ; (word) { 0xff00 } x4 > + psrlw m2, 8 ; (word) { 0x00ff } x4 > +%endif > > movsxdifnidn wq, wd > mov xq, wq > @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x > je .loop_simd > > .loop_scalar: > - mov tmpb, [srcq + wq + 2] > + mov tmpb, [srcq + wq + %1] > mov [dstq+wq + 0], tmpb > - mov tmpb, [srcq + wq + 1] > + mov tmpb, [srcq + wq + %2] > mov [dstq+wq + 1], tmpb > - mov tmpb, [srcq + wq + 0] > + mov tmpb, [srcq + wq + %3] > mov [dstq+wq + 2], tmpb > - mov tmpb, [srcq + wq + 3] > + mov tmpb, [srcq + wq + %4] > mov [dstq+wq + 3], tmpb > add wq, 4 > sub xq, 4 > @@ -86,29 +124,26 @@ jge .end > > .loop_simd: > movu m0, [srcq+wq] > - movu m1, [srcq+wq+8] > - > - pshufw m3, m0, 177 > - pshufw m5, m1, 177 > - > - pand m0, m7 > - pand m3, m6 > > - pand m1, m7 > - pand m5, m6 > + SHUFFLE%1%2%3%4_SSE2 > > - por m0, m3 > - por m1, m5 > + por m0, m1 > > movu [dstq+wq], m0 > - movu [dstq+wq + 8], m1 > > - add wq, mmsize*2 > + add wq, mmsize > jl .loop_simd > > .end: > - emms > RET > +%endmacro > + > +INIT_XMM sse2 > +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1 > +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1 > +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0 > +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0 > +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0 > > ;------------------------------------------------------------------------------ > ; shuffle_bytes_## 
(const uint8_t *src, uint8_t *dst, int src_size) How old are the youngest processors with SSE2, but without SSSE3? According to Wikipedia, nearly 15 years. Which makes me believe that the SSE2 versions are not worth it (how many of these CPUs will use a new FFmpeg anyway?). - Andreas _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions 2024-06-06 14:48 ` Andreas Rheinhardt @ 2024-06-06 15:45 ` James Almer 2024-06-08 15:55 ` Andreas Rheinhardt 0 siblings, 1 reply; 10+ messages in thread From: James Almer @ 2024-06-06 15:45 UTC (permalink / raw) To: ffmpeg-devel On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote: > James Almer: >> And remove shuffle_bytes_2103_mmxext. >> >> shuffle_bytes_0321_c: 28.1 >> shuffle_bytes_0321_sse2: 13.6 >> shuffle_bytes_0321_ssse3: 9.6 >> shuffle_bytes_0321_avx2: 7.1 >> shuffle_bytes_1230_c: 52.6 >> shuffle_bytes_1230_sse2: 12.1 >> shuffle_bytes_1230_ssse3: 8.6 >> shuffle_bytes_1230_avx2: 6.6 >> shuffle_bytes_2103_c: 29.1 >> shuffle_bytes_2103_mmxext: 29.3 // removed >> shuffle_bytes_2103_sse2: 12.5 >> shuffle_bytes_2103_ssse3: 8.6 >> shuffle_bytes_2103_avx2: 7.1 >> shuffle_bytes_3012_c: 52.1 >> shuffle_bytes_3012_sse2: 12.1 >> shuffle_bytes_3012_ssse3: 8.6 >> shuffle_bytes_3012_avx2: 7.1 >> shuffle_bytes_3210_c: 50.6 >> shuffle_bytes_3210_sse2: 14.6 >> shuffle_bytes_3210_ssse3: 8.6 >> shuffle_bytes_3210_avx2: 7.1 >> >> Signed-off-by: James Almer <jamrial@gmail.com> >> --- >> libswscale/x86/rgb2rgb.c | 14 ++++-- >> libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++----------- >> 2 files changed, 69 insertions(+), 28 deletions(-) >> >> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c >> index 21ccfafe51..9f6c8efc72 100644 >> --- a/libswscale/x86/rgb2rgb.c >> +++ b/libswscale/x86/rgb2rgb.c >> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset); >> >> #endif /* HAVE_INLINE_ASM */ >> >> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size); >> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size); >> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size); >> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size); >> +void 
ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size); >> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size); >> void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size); >> void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size); >> void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size); >> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void) >> rgb2rgb_init_avx(); >> #endif /* HAVE_INLINE_ASM */ >> >> - if (EXTERNAL_MMXEXT(cpu_flags)) { >> - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext; >> - } >> if (EXTERNAL_SSE2(cpu_flags)) { >> + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2; >> + shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2; >> + shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2; >> + shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2; >> + shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2; >> #if ARCH_X86_64 >> uyvytoyuv422 = ff_uyvytoyuv422_sse2; >> #endif >> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm >> index 0bf1278718..9fc1974389 100644 >> --- a/libswscale/x86/rgb_2_rgb.asm >> +++ b/libswscale/x86/rgb_2_rgb.asm >> @@ -25,7 +25,6 @@ >> >> SECTION_RODATA >> >> -pb_mask_shuffle2103_mmx times 8 dw 255 >> pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 >> pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13 >> pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 >> @@ -50,11 +49,50 @@ SECTION .text >> ;------------------------------------------------------------------------------ >> ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size) >> ;------------------------------------------------------------------------------ >> -INIT_MMX mmxext >> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x >> - mova m6, [pb_mask_shuffle2103_mmx] >> - mova m7, m6 >> - psllq m7, 8 >> + >> +%macro SHUFFLE2103_SSE2 0 >> + pshuflw 
m1, m0, 0xb1 >> + pshufhw m1, m1, 0xb1 >> + >> + pand m0, m3 >> + pand m1, m2 >> +%endmacro >> + >> +%macro SHUFFLE0321_SSE2 0 >> + pshuflw m1, m0, 0xb1 >> + pshufhw m1, m1, 0xb1 >> + >> + pand m0, m2 >> + pand m1, m3 >> +%endmacro >> + >> +%macro SHUFFLE1230_SSE2 0 >> + pslld m1, m0, 24 >> + psrld m0, 8 >> +%endmacro >> + >> +%macro SHUFFLE3012_SSE2 0 >> + pslld m1, m0, 8 >> + psrld m0, 24 >> +%endmacro >> + >> +%macro SHUFFLE3210_SSE2 0 >> + pshuflw m1, m0, 0xb1 >> + pshufhw m1, m1, 0xb1 >> + >> + psrlw m0, m1, 8 >> + psllw m1, 8 >> +%endmacro >> + >> +; %1-4 index shuffle >> +; %5 load mask >> +%macro SHUFFLE_BYTES_SSE2 5 >> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x >> +%if %5 >> + pcmpeqw m2, m2 >> + psllw m3, m2, 8 ; (word) { 0xff00 } x4 >> + psrlw m2, 8 ; (word) { 0x00ff } x4 >> +%endif >> >> movsxdifnidn wq, wd >> mov xq, wq >> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x >> je .loop_simd >> >> .loop_scalar: >> - mov tmpb, [srcq + wq + 2] >> + mov tmpb, [srcq + wq + %1] >> mov [dstq+wq + 0], tmpb >> - mov tmpb, [srcq + wq + 1] >> + mov tmpb, [srcq + wq + %2] >> mov [dstq+wq + 1], tmpb >> - mov tmpb, [srcq + wq + 0] >> + mov tmpb, [srcq + wq + %3] >> mov [dstq+wq + 2], tmpb >> - mov tmpb, [srcq + wq + 3] >> + mov tmpb, [srcq + wq + %4] >> mov [dstq+wq + 3], tmpb >> add wq, 4 >> sub xq, 4 >> @@ -86,29 +124,26 @@ jge .end >> >> .loop_simd: >> movu m0, [srcq+wq] >> - movu m1, [srcq+wq+8] >> - >> - pshufw m3, m0, 177 >> - pshufw m5, m1, 177 >> - >> - pand m0, m7 >> - pand m3, m6 >> >> - pand m1, m7 >> - pand m5, m6 >> + SHUFFLE%1%2%3%4_SSE2 >> >> - por m0, m3 >> - por m1, m5 >> + por m0, m1 >> >> movu [dstq+wq], m0 >> - movu [dstq+wq + 8], m1 >> >> - add wq, mmsize*2 >> + add wq, mmsize >> jl .loop_simd >> >> .end: >> - emms >> RET >> +%endmacro >> + >> +INIT_XMM sse2 >> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1 >> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1 >> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0 >> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 
0 >> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0 >> >> ;------------------------------------------------------------------------------ >> ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size) > > How old are the youngest processors with SSE2, but without SSSE3? AMD Phenom/K10. > According to Wikipedia, nearly 15 years. Which makes me believe that the > SSE2 versions are not worth it (how many of these CPUs will use a new > FFmpeg anyway?). Simply by using the latest version of a video player that uses ffmpeg is enough to be able to run the newest code. It was easy to write and i don't feel particularly interested enough to argue, so if you think it's not worth adding, i can just remove the mmxext version and skip adding anything. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions 2024-06-06 15:45 ` James Almer @ 2024-06-08 15:55 ` Andreas Rheinhardt 2024-06-08 16:21 ` Rémi Denis-Courmont 2024-06-09 15:36 ` James Almer 0 siblings, 2 replies; 10+ messages in thread From: Andreas Rheinhardt @ 2024-06-08 15:55 UTC (permalink / raw) To: ffmpeg-devel James Almer: > On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote: >> James Almer: >>> And remove shuffle_bytes_2103_mmxext. >>> >>> shuffle_bytes_0321_c: 28.1 >>> shuffle_bytes_0321_sse2: 13.6 >>> shuffle_bytes_0321_ssse3: 9.6 >>> shuffle_bytes_0321_avx2: 7.1 >>> shuffle_bytes_1230_c: 52.6 >>> shuffle_bytes_1230_sse2: 12.1 >>> shuffle_bytes_1230_ssse3: 8.6 >>> shuffle_bytes_1230_avx2: 6.6 >>> shuffle_bytes_2103_c: 29.1 >>> shuffle_bytes_2103_mmxext: 29.3 // removed >>> shuffle_bytes_2103_sse2: 12.5 >>> shuffle_bytes_2103_ssse3: 8.6 >>> shuffle_bytes_2103_avx2: 7.1 >>> shuffle_bytes_3012_c: 52.1 >>> shuffle_bytes_3012_sse2: 12.1 >>> shuffle_bytes_3012_ssse3: 8.6 >>> shuffle_bytes_3012_avx2: 7.1 >>> shuffle_bytes_3210_c: 50.6 >>> shuffle_bytes_3210_sse2: 14.6 >>> shuffle_bytes_3210_ssse3: 8.6 >>> shuffle_bytes_3210_avx2: 7.1 >>> >>> Signed-off-by: James Almer <jamrial@gmail.com> >>> --- >>> libswscale/x86/rgb2rgb.c | 14 ++++-- >>> libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++----------- >>> 2 files changed, 69 insertions(+), 28 deletions(-) >>> >>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c >>> index 21ccfafe51..9f6c8efc72 100644 >>> --- a/libswscale/x86/rgb2rgb.c >>> +++ b/libswscale/x86/rgb2rgb.c >>> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, >>> ff_bgr2UVOffset); >>> #endif /* HAVE_INLINE_ASM */ >>> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t >>> *dst, int src_size); >>> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, >>> int src_size); >>> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, >>> int src_size); >>> 
+void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, >>> int src_size); >>> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, >>> int src_size); >>> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, >>> int src_size); >>> void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, >>> int src_size); >>> void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, >>> int src_size); >>> void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, >>> int src_size); >>> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void) >>> rgb2rgb_init_avx(); >>> #endif /* HAVE_INLINE_ASM */ >>> - if (EXTERNAL_MMXEXT(cpu_flags)) { >>> - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext; >>> - } >>> if (EXTERNAL_SSE2(cpu_flags)) { >>> + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2; >>> + shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2; >>> + shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2; >>> + shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2; >>> + shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2; >>> #if ARCH_X86_64 >>> uyvytoyuv422 = ff_uyvytoyuv422_sse2; >>> #endif >>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm >>> index 0bf1278718..9fc1974389 100644 >>> --- a/libswscale/x86/rgb_2_rgb.asm >>> +++ b/libswscale/x86/rgb_2_rgb.asm >>> @@ -25,7 +25,6 @@ >>> SECTION_RODATA >>> -pb_mask_shuffle2103_mmx times 8 dw 255 >>> pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, >>> 12, 15 >>> pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, >>> 14, 13 >>> pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, >>> 15, 12 >>> @@ -50,11 +49,50 @@ SECTION .text >>> >>> ;------------------------------------------------------------------------------ >>> ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int >>> src_size) >>> >>> ;------------------------------------------------------------------------------ >>> -INIT_MMX mmxext >>> -cglobal 
shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x >>> - mova m6, [pb_mask_shuffle2103_mmx] >>> - mova m7, m6 >>> - psllq m7, 8 >>> + >>> +%macro SHUFFLE2103_SSE2 0 >>> + pshuflw m1, m0, 0xb1 >>> + pshufhw m1, m1, 0xb1 >>> + >>> + pand m0, m3 >>> + pand m1, m2 >>> +%endmacro >>> + >>> +%macro SHUFFLE0321_SSE2 0 >>> + pshuflw m1, m0, 0xb1 >>> + pshufhw m1, m1, 0xb1 >>> + >>> + pand m0, m2 >>> + pand m1, m3 >>> +%endmacro >>> + >>> +%macro SHUFFLE1230_SSE2 0 >>> + pslld m1, m0, 24 >>> + psrld m0, 8 >>> +%endmacro >>> + >>> +%macro SHUFFLE3012_SSE2 0 >>> + pslld m1, m0, 8 >>> + psrld m0, 24 >>> +%endmacro >>> + >>> +%macro SHUFFLE3210_SSE2 0 >>> + pshuflw m1, m0, 0xb1 >>> + pshufhw m1, m1, 0xb1 >>> + >>> + psrlw m0, m1, 8 >>> + psllw m1, 8 >>> +%endmacro >>> + >>> +; %1-4 index shuffle >>> +; %5 load mask >>> +%macro SHUFFLE_BYTES_SSE2 5 >>> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x >>> +%if %5 >>> + pcmpeqw m2, m2 >>> + psllw m3, m2, 8 ; (word) { 0xff00 } x4 >>> + psrlw m2, 8 ; (word) { 0x00ff } x4 >>> +%endif >>> movsxdifnidn wq, wd >>> mov xq, wq >>> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, >>> w, tmp, x >>> je .loop_simd >>> .loop_scalar: >>> - mov tmpb, [srcq + wq + 2] >>> + mov tmpb, [srcq + wq + %1] >>> mov [dstq+wq + 0], tmpb >>> - mov tmpb, [srcq + wq + 1] >>> + mov tmpb, [srcq + wq + %2] >>> mov [dstq+wq + 1], tmpb >>> - mov tmpb, [srcq + wq + 0] >>> + mov tmpb, [srcq + wq + %3] >>> mov [dstq+wq + 2], tmpb >>> - mov tmpb, [srcq + wq + 3] >>> + mov tmpb, [srcq + wq + %4] >>> mov [dstq+wq + 3], tmpb >>> add wq, 4 >>> sub xq, 4 >>> @@ -86,29 +124,26 @@ jge .end >>> .loop_simd: >>> movu m0, [srcq+wq] >>> - movu m1, [srcq+wq+8] >>> - >>> - pshufw m3, m0, 177 >>> - pshufw m5, m1, 177 >>> - >>> - pand m0, m7 >>> - pand m3, m6 >>> - pand m1, m7 >>> - pand m5, m6 >>> + SHUFFLE%1%2%3%4_SSE2 >>> - por m0, m3 >>> - por m1, m5 >>> + por m0, m1 >>> movu [dstq+wq], m0 >>> - movu [dstq+wq + 8], m1 >>> - add wq, mmsize*2 >>> + add 
wq, mmsize >>> jl .loop_simd >>> .end: >>> - emms >>> RET >>> +%endmacro >>> + >>> +INIT_XMM sse2 >>> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1 >>> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1 >>> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0 >>> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0 >>> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0 >>> >>> ;------------------------------------------------------------------------------ >>> ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size) >> >> How old are the youngest processors with SSE2, but without SSSE3? > > AMD Phenom/K10. > >> According to Wikipedia, nearly 15 years. Which makes me believe that the >> SSE2 versions are not worth it (how many of these CPUs will use a new >> FFmpeg anyway?). > > Simply by using the latest version of a video player that uses ffmpeg is > enough to be able to run the newest code. I asked "how many", not "how". > It was easy to write and i don't feel particularly interested enough to > argue, so if you think it's not worth adding, i can just remove the > mmxext version and skip adding anything. I think we should not optimize for CPUs that do not even have x86-64 v2. So I would not add these SSE2 versions. But the one missing SSSE3 version (shuffle_bytes_2103_ssse3) is of course worth it. - Andreas _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions 2024-06-08 15:55 ` Andreas Rheinhardt @ 2024-06-08 16:21 ` Rémi Denis-Courmont 2024-06-09 15:36 ` James Almer 1 sibling, 0 replies; 10+ messages in thread From: Rémi Denis-Courmont @ 2024-06-08 16:21 UTC (permalink / raw) To: ffmpeg-devel Le lauantaina 8. kesäkuuta 2024, 18.55.53 EEST Andreas Rheinhardt a écrit : > I think we should not optimize for CPUs that do not even have x86-64 v2. > So I would not add these SSE2 versions. We certainly should consider ditching SSE2 where SSSE3 is available now or in the near future. But in this particular case, James seems to be converting MMX(EXT) code into SSE2 code, more so than introducing pure new SSE2 code. It took almost forever to agree to get rid of MMX. I would like to go ahead with that, and I like to think that many other people would too. So can we at least tolerate porting MMX to SSE2 until we have gotten rid of MMX for good? -- 雷米‧德尼-库尔蒙 http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions 2024-06-08 15:55 ` Andreas Rheinhardt 2024-06-08 16:21 ` Rémi Denis-Courmont @ 2024-06-09 15:36 ` James Almer 2024-06-09 16:05 ` Rémi Denis-Courmont 2024-06-10 17:06 ` James Almer 1 sibling, 2 replies; 10+ messages in thread From: James Almer @ 2024-06-09 15:36 UTC (permalink / raw) To: ffmpeg-devel On 6/8/2024 12:55 PM, Andreas Rheinhardt wrote: > James Almer: >> On 6/6/2024 11:48 AM, Andreas Rheinhardt wrote: >>> James Almer: >>>> And remove shuffle_bytes_2103_mmxext. >>>> >>>> shuffle_bytes_0321_c: 28.1 >>>> shuffle_bytes_0321_sse2: 13.6 >>>> shuffle_bytes_0321_ssse3: 9.6 >>>> shuffle_bytes_0321_avx2: 7.1 >>>> shuffle_bytes_1230_c: 52.6 >>>> shuffle_bytes_1230_sse2: 12.1 >>>> shuffle_bytes_1230_ssse3: 8.6 >>>> shuffle_bytes_1230_avx2: 6.6 >>>> shuffle_bytes_2103_c: 29.1 >>>> shuffle_bytes_2103_mmxext: 29.3 // removed >>>> shuffle_bytes_2103_sse2: 12.5 >>>> shuffle_bytes_2103_ssse3: 8.6 >>>> shuffle_bytes_2103_avx2: 7.1 >>>> shuffle_bytes_3012_c: 52.1 >>>> shuffle_bytes_3012_sse2: 12.1 >>>> shuffle_bytes_3012_ssse3: 8.6 >>>> shuffle_bytes_3012_avx2: 7.1 >>>> shuffle_bytes_3210_c: 50.6 >>>> shuffle_bytes_3210_sse2: 14.6 >>>> shuffle_bytes_3210_ssse3: 8.6 >>>> shuffle_bytes_3210_avx2: 7.1 >>>> >>>> Signed-off-by: James Almer <jamrial@gmail.com> >>>> --- >>>> libswscale/x86/rgb2rgb.c | 14 ++++-- >>>> libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++----------- >>>> 2 files changed, 69 insertions(+), 28 deletions(-) >>>> >>>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c >>>> index 21ccfafe51..9f6c8efc72 100644 >>>> --- a/libswscale/x86/rgb2rgb.c >>>> +++ b/libswscale/x86/rgb2rgb.c >>>> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, >>>> ff_bgr2UVOffset); >>>> #endif /* HAVE_INLINE_ASM */ >>>> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t >>>> *dst, int src_size); >>>> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, 
uint8_t *dst, >>>> int src_size); >>>> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, >>>> int src_size); >>>> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, >>>> int src_size); >>>> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, >>>> int src_size); >>>> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, >>>> int src_size); >>>> void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, >>>> int src_size); >>>> void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, >>>> int src_size); >>>> void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, >>>> int src_size); >>>> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void) >>>> rgb2rgb_init_avx(); >>>> #endif /* HAVE_INLINE_ASM */ >>>> - if (EXTERNAL_MMXEXT(cpu_flags)) { >>>> - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext; >>>> - } >>>> if (EXTERNAL_SSE2(cpu_flags)) { >>>> + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2; >>>> + shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2; >>>> + shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2; >>>> + shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2; >>>> + shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2; >>>> #if ARCH_X86_64 >>>> uyvytoyuv422 = ff_uyvytoyuv422_sse2; >>>> #endif >>>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm >>>> index 0bf1278718..9fc1974389 100644 >>>> --- a/libswscale/x86/rgb_2_rgb.asm >>>> +++ b/libswscale/x86/rgb_2_rgb.asm >>>> @@ -25,7 +25,6 @@ >>>> SECTION_RODATA >>>> -pb_mask_shuffle2103_mmx times 8 dw 255 >>>> pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, >>>> 12, 15 >>>> pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, >>>> 14, 13 >>>> pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, >>>> 15, 12 >>>> @@ -50,11 +49,50 @@ SECTION .text >>>> >>>> ;------------------------------------------------------------------------------ >>>> ; shuffle_bytes_2103_mmext 
(const uint8_t *src, uint8_t *dst, int >>>> src_size) >>>> >>>> ;------------------------------------------------------------------------------ >>>> -INIT_MMX mmxext >>>> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x >>>> - mova m6, [pb_mask_shuffle2103_mmx] >>>> - mova m7, m6 >>>> - psllq m7, 8 >>>> + >>>> +%macro SHUFFLE2103_SSE2 0 >>>> + pshuflw m1, m0, 0xb1 >>>> + pshufhw m1, m1, 0xb1 >>>> + >>>> + pand m0, m3 >>>> + pand m1, m2 >>>> +%endmacro >>>> + >>>> +%macro SHUFFLE0321_SSE2 0 >>>> + pshuflw m1, m0, 0xb1 >>>> + pshufhw m1, m1, 0xb1 >>>> + >>>> + pand m0, m2 >>>> + pand m1, m3 >>>> +%endmacro >>>> + >>>> +%macro SHUFFLE1230_SSE2 0 >>>> + pslld m1, m0, 24 >>>> + psrld m0, 8 >>>> +%endmacro >>>> + >>>> +%macro SHUFFLE3012_SSE2 0 >>>> + pslld m1, m0, 8 >>>> + psrld m0, 24 >>>> +%endmacro >>>> + >>>> +%macro SHUFFLE3210_SSE2 0 >>>> + pshuflw m1, m0, 0xb1 >>>> + pshufhw m1, m1, 0xb1 >>>> + >>>> + psrlw m0, m1, 8 >>>> + psllw m1, 8 >>>> +%endmacro >>>> + >>>> +; %1-4 index shuffle >>>> +; %5 load mask >>>> +%macro SHUFFLE_BYTES_SSE2 5 >>>> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x >>>> +%if %5 >>>> + pcmpeqw m2, m2 >>>> + psllw m3, m2, 8 ; (word) { 0xff00 } x4 >>>> + psrlw m2, 8 ; (word) { 0x00ff } x4 >>>> +%endif >>>> movsxdifnidn wq, wd >>>> mov xq, wq >>>> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, >>>> w, tmp, x >>>> je .loop_simd >>>> .loop_scalar: >>>> - mov tmpb, [srcq + wq + 2] >>>> + mov tmpb, [srcq + wq + %1] >>>> mov [dstq+wq + 0], tmpb >>>> - mov tmpb, [srcq + wq + 1] >>>> + mov tmpb, [srcq + wq + %2] >>>> mov [dstq+wq + 1], tmpb >>>> - mov tmpb, [srcq + wq + 0] >>>> + mov tmpb, [srcq + wq + %3] >>>> mov [dstq+wq + 2], tmpb >>>> - mov tmpb, [srcq + wq + 3] >>>> + mov tmpb, [srcq + wq + %4] >>>> mov [dstq+wq + 3], tmpb >>>> add wq, 4 >>>> sub xq, 4 >>>> @@ -86,29 +124,26 @@ jge .end >>>> .loop_simd: >>>> movu m0, [srcq+wq] >>>> - movu m1, [srcq+wq+8] >>>> - >>>> - pshufw m3, m0, 177 >>>> - 
pshufw m5, m1, 177 >>>> - >>>> - pand m0, m7 >>>> - pand m3, m6 >>>> - pand m1, m7 >>>> - pand m5, m6 >>>> + SHUFFLE%1%2%3%4_SSE2 >>>> - por m0, m3 >>>> - por m1, m5 >>>> + por m0, m1 >>>> movu [dstq+wq], m0 >>>> - movu [dstq+wq + 8], m1 >>>> - add wq, mmsize*2 >>>> + add wq, mmsize >>>> jl .loop_simd >>>> .end: >>>> - emms >>>> RET >>>> +%endmacro >>>> + >>>> +INIT_XMM sse2 >>>> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1 >>>> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1 >>>> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0 >>>> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0 >>>> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0 >>>> >>>> ;------------------------------------------------------------------------------ >>>> ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size) >>> >>> How old are the youngest processors with SSE2, but without SSSE3? >> >> AMD Phenom/K10. >> >>> According to Wikipedia, nearly 15 years. Which makes me believe that the >>> SSE2 versions are not worth it (how many of these CPUs will use a new >>> FFmpeg anyway?). >> >> Simply by using the latest version of a video player that uses ffmpeg is >> enough to be able to run the newest code. > > I asked "how many", not "how". I obviously don't have that kind of information. You'd need to look at things like Steam's, Firefox's or Chrome's hardware surveys. > >> It was easy to write and i don't feel particularly interested enough to >> argue, so if you think it's not worth adding, i can just remove the >> mmxext version and skip adding anything. > > I think we should not optimize for CPUs that do not even have x86-64 v2. What is x86-64 v2? > So I would not add these SSE2 versions. But the one missing SSSE3 Ok, I'll just remove the mmxext one, then. > version (shuffle_bytes_2103_ssse3) is of course worth it. I will look into that. 
> > - Andreas > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions 2024-06-09 15:36 ` James Almer @ 2024-06-09 16:05 ` Rémi Denis-Courmont 2024-06-10 17:06 ` James Almer 1 sibling, 0 replies; 10+ messages in thread From: Rémi Denis-Courmont @ 2024-06-09 16:05 UTC (permalink / raw) To: ffmpeg-devel Le sunnuntaina 9. kesäkuuta 2024, 18.36.35 EEST James Almer a écrit : > I obviously don't have that kind of information. You'd need to look at > things like Steam's, Firefox's or Chrome's hardware surveys. As discussed on IRC yesterday, Steam claims that 106.85% of processors support SSE2 (and as many SSE3) but "only" 106.63% support SSSE3 (seriously). What 100% are, I don't know. AVX2 is close with 99.83% but no cigar. In any case, there is a tiny but observable gap between SSE2 and SSSE3 there. > >> It was easy to write and i don't feel particularly interested enough to > >> argue, so if you think it's not worth adding, i can just remove the > >> mmxext version and skip adding anything. > > > > I think we should not optimize for CPUs that do not even have x86-64 v2. > > What is x86-64 v2? See https://developers.redhat.com/blog/2021/01/05/building-red-hat-enterprise-linux-9-for-the-x86-64-v2-microarchitecture-level -- 雷米‧德尼-库尔蒙 http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions 2024-06-09 15:36 ` James Almer 2024-06-09 16:05 ` Rémi Denis-Courmont @ 2024-06-10 17:06 ` James Almer 2024-06-11 5:18 ` Andreas Rheinhardt 1 sibling, 1 reply; 10+ messages in thread From: James Almer @ 2024-06-10 17:06 UTC (permalink / raw) To: ffmpeg-devel On 6/9/2024 12:36 PM, James Almer wrote: >> So I would not add these SSE2 versions. But the one missing SSSE3 >> version (shuffle_bytes_2103_ssse3) is of course worth it. > > I will look into that. I'm not sure why you said it's missing, because it's there. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions 2024-06-10 17:06 ` James Almer @ 2024-06-11 5:18 ` Andreas Rheinhardt 0 siblings, 0 replies; 10+ messages in thread From: Andreas Rheinhardt @ 2024-06-11 5:18 UTC (permalink / raw) To: ffmpeg-devel James Almer: > On 6/9/2024 12:36 PM, James Almer wrote: >>> So I would not add these SSE2 versions. But the one missing SSSE3 >>> version (shuffle_bytes_2103_ssse3) is of course worth it. >> >> I will look into that. > > I'm not sure why you said it's missing, because it's there. Sorry for having said garbage. - Andreas _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2024-06-11 5:19 UTC | newest] Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2024-06-05 20:51 [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: replace shuffle_bytes_2103_mmxext with an SSE2 version James Almer 2024-06-06 14:15 ` [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions James Almer 2024-06-06 14:48 ` Andreas Rheinhardt 2024-06-06 15:45 ` James Almer 2024-06-08 15:55 ` Andreas Rheinhardt 2024-06-08 16:21 ` Rémi Denis-Courmont 2024-06-09 15:36 ` James Almer 2024-06-09 16:05 ` Rémi Denis-Courmont 2024-06-10 17:06 ` James Almer 2024-06-11 5:18 ` Andreas Rheinhardt
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git