From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
Date: Thu, 6 Jun 2024 16:48:44 +0200
Message-ID: <AS8P250MB07441B4682A3DE6F4B2D9AD18FFA2@AS8P250MB0744.EURP250.PROD.OUTLOOK.COM> (raw)
In-Reply-To: <20240606141505.132-1-jamrial@gmail.com>
James Almer:
> And remove shuffle_bytes_2103_mmxext.
>
> shuffle_bytes_0321_c: 28.1
> shuffle_bytes_0321_sse2: 13.6
> shuffle_bytes_0321_ssse3: 9.6
> shuffle_bytes_0321_avx2: 7.1
> shuffle_bytes_1230_c: 52.6
> shuffle_bytes_1230_sse2: 12.1
> shuffle_bytes_1230_ssse3: 8.6
> shuffle_bytes_1230_avx2: 6.6
> shuffle_bytes_2103_c: 29.1
> shuffle_bytes_2103_mmxext: 29.3 // removed
> shuffle_bytes_2103_sse2: 12.5
> shuffle_bytes_2103_ssse3: 8.6
> shuffle_bytes_2103_avx2: 7.1
> shuffle_bytes_3012_c: 52.1
> shuffle_bytes_3012_sse2: 12.1
> shuffle_bytes_3012_ssse3: 8.6
> shuffle_bytes_3012_avx2: 7.1
> shuffle_bytes_3210_c: 50.6
> shuffle_bytes_3210_sse2: 14.6
> shuffle_bytes_3210_ssse3: 8.6
> shuffle_bytes_3210_avx2: 7.1
>
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
> libswscale/x86/rgb2rgb.c | 14 ++++--
> libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
> 2 files changed, 69 insertions(+), 28 deletions(-)
>
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index 21ccfafe51..9f6c8efc72 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
>
> #endif /* HAVE_INLINE_ASM */
>
> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
> rgb2rgb_init_avx();
> #endif /* HAVE_INLINE_ASM */
>
> - if (EXTERNAL_MMXEXT(cpu_flags)) {
> - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
> - }
> if (EXTERNAL_SSE2(cpu_flags)) {
> + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
> + shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
> + shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
> + shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
> + shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
> #if ARCH_X86_64
> uyvytoyuv422 = ff_uyvytoyuv422_sse2;
> #endif
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index 0bf1278718..9fc1974389 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -25,7 +25,6 @@
>
> SECTION_RODATA
>
> -pb_mask_shuffle2103_mmx times 8 dw 255
> pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
> pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
> pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
> @@ -50,11 +49,50 @@ SECTION .text
> ;------------------------------------------------------------------------------
> ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
> ;------------------------------------------------------------------------------
> -INIT_MMX mmxext
> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
> - mova m6, [pb_mask_shuffle2103_mmx]
> - mova m7, m6
> - psllq m7, 8
> +
> +%macro SHUFFLE2103_SSE2 0
> + pshuflw m1, m0, 0xb1
> + pshufhw m1, m1, 0xb1
> +
> + pand m0, m3
> + pand m1, m2
> +%endmacro
> +
> +%macro SHUFFLE0321_SSE2 0
> + pshuflw m1, m0, 0xb1
> + pshufhw m1, m1, 0xb1
> +
> + pand m0, m2
> + pand m1, m3
> +%endmacro
> +
> +%macro SHUFFLE1230_SSE2 0
> + pslld m1, m0, 24
> + psrld m0, 8
> +%endmacro
> +
> +%macro SHUFFLE3012_SSE2 0
> + pslld m1, m0, 8
> + psrld m0, 24
> +%endmacro
> +
> +%macro SHUFFLE3210_SSE2 0
> + pshuflw m1, m0, 0xb1
> + pshufhw m1, m1, 0xb1
> +
> + psrlw m0, m1, 8
> + psllw m1, 8
> +%endmacro
> +
> +; %1-4 index shuffle
> +; %5 load mask
> +%macro SHUFFLE_BYTES_SSE2 5
> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
> +%if %5
> + pcmpeqw m2, m2
> + psllw m3, m2, 8 ; (word) { 0xff00 } x4
> + psrlw m2, 8 ; (word) { 0x00ff } x4
> +%endif
>
> movsxdifnidn wq, wd
> mov xq, wq
> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
> je .loop_simd
>
> .loop_scalar:
> - mov tmpb, [srcq + wq + 2]
> + mov tmpb, [srcq + wq + %1]
> mov [dstq+wq + 0], tmpb
> - mov tmpb, [srcq + wq + 1]
> + mov tmpb, [srcq + wq + %2]
> mov [dstq+wq + 1], tmpb
> - mov tmpb, [srcq + wq + 0]
> + mov tmpb, [srcq + wq + %3]
> mov [dstq+wq + 2], tmpb
> - mov tmpb, [srcq + wq + 3]
> + mov tmpb, [srcq + wq + %4]
> mov [dstq+wq + 3], tmpb
> add wq, 4
> sub xq, 4
> @@ -86,29 +124,26 @@ jge .end
>
> .loop_simd:
> movu m0, [srcq+wq]
> - movu m1, [srcq+wq+8]
> -
> - pshufw m3, m0, 177
> - pshufw m5, m1, 177
> -
> - pand m0, m7
> - pand m3, m6
>
> - pand m1, m7
> - pand m5, m6
> + SHUFFLE%1%2%3%4_SSE2
>
> - por m0, m3
> - por m1, m5
> + por m0, m1
>
> movu [dstq+wq], m0
> - movu [dstq+wq + 8], m1
>
> - add wq, mmsize*2
> + add wq, mmsize
> jl .loop_simd
>
> .end:
> - emms
> RET
> +%endmacro
> +
> +INIT_XMM sse2
> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>
> ;------------------------------------------------------------------------------
> ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
How old are the youngest processors with SSE2 but without SSSE3?
According to Wikipedia, nearly 15 years, which makes me believe that the
SSE2 versions are not worth it (how many of these CPUs will run a new
FFmpeg anyway?).
- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2024-06-06 15:04 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-06-05 20:51 [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: replace shuffle_bytes_2103_mmxext with an SSE2 version James Almer
2024-06-06 14:15 ` [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions James Almer
2024-06-06 14:48 ` Andreas Rheinhardt [this message]
2024-06-06 15:45 ` James Almer
2024-06-08 15:55 ` Andreas Rheinhardt
2024-06-08 16:21 ` Rémi Denis-Courmont
2024-06-09 15:36 ` James Almer
2024-06-09 16:05 ` Rémi Denis-Courmont
2024-06-10 17:06 ` James Almer
2024-06-11 5:18 ` Andreas Rheinhardt
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=AS8P250MB07441B4682A3DE6F4B2D9AD18FFA2@AS8P250MB0744.EURP250.PROD.OUTLOOK.COM \
--to=andreas.rheinhardt@outlook.com \
--cc=ffmpeg-devel@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git