Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
Date: Thu, 6 Jun 2024 16:48:44 +0200
Message-ID: <AS8P250MB07441B4682A3DE6F4B2D9AD18FFA2@AS8P250MB0744.EURP250.PROD.OUTLOOK.COM> (raw)
In-Reply-To: <20240606141505.132-1-jamrial@gmail.com>

James Almer:
> And remove shuffle_bytes_2103_mmxext.
> 
> shuffle_bytes_0321_c: 28.1
> shuffle_bytes_0321_sse2: 13.6
> shuffle_bytes_0321_ssse3: 9.6
> shuffle_bytes_0321_avx2: 7.1
> shuffle_bytes_1230_c: 52.6
> shuffle_bytes_1230_sse2: 12.1
> shuffle_bytes_1230_ssse3: 8.6
> shuffle_bytes_1230_avx2: 6.6
> shuffle_bytes_2103_c: 29.1
> shuffle_bytes_2103_mmxext: 29.3 // removed
> shuffle_bytes_2103_sse2: 12.5
> shuffle_bytes_2103_ssse3: 8.6
> shuffle_bytes_2103_avx2: 7.1
> shuffle_bytes_3012_c: 52.1
> shuffle_bytes_3012_sse2: 12.1
> shuffle_bytes_3012_ssse3: 8.6
> shuffle_bytes_3012_avx2: 7.1
> shuffle_bytes_3210_c: 50.6
> shuffle_bytes_3210_sse2: 14.6
> shuffle_bytes_3210_ssse3: 8.6
> shuffle_bytes_3210_avx2: 7.1
> 
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libswscale/x86/rgb2rgb.c     | 14 ++++--
>  libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>  2 files changed, 69 insertions(+), 28 deletions(-)
> 
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index 21ccfafe51..9f6c8efc72 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
>  
>  #endif /* HAVE_INLINE_ASM */
>  
> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>          rgb2rgb_init_avx();
>  #endif /* HAVE_INLINE_ASM */
>  
> -    if (EXTERNAL_MMXEXT(cpu_flags)) {
> -        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
> -    }
>      if (EXTERNAL_SSE2(cpu_flags)) {
> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>  #if ARCH_X86_64
>          uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>  #endif
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index 0bf1278718..9fc1974389 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -25,7 +25,6 @@
>  
>  SECTION_RODATA
>  
> -pb_mask_shuffle2103_mmx times 8 dw 255
>  pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
>  pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
>  pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
> @@ -50,11 +49,50 @@ SECTION .text
>  ;------------------------------------------------------------------------------
>  ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
>  ;------------------------------------------------------------------------------
> -INIT_MMX mmxext
> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
> -    mova   m6, [pb_mask_shuffle2103_mmx]
> -    mova   m7, m6
> -    psllq  m7, 8
> +
> +%macro SHUFFLE2103_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    pand     m0, m3
> +    pand     m1, m2
> +%endmacro
> +
> +%macro SHUFFLE0321_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    pand     m0, m2
> +    pand     m1, m3
> +%endmacro
> +
> +%macro SHUFFLE1230_SSE2 0
> +    pslld    m1, m0, 24
> +    psrld    m0, 8
> +%endmacro
> +
> +%macro SHUFFLE3012_SSE2 0
> +    pslld    m1, m0, 8
> +    psrld    m0, 24
> +%endmacro
> +
> +%macro SHUFFLE3210_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    psrlw     m0, m1, 8
> +    psllw     m1, 8
> +%endmacro
> +
> +; %1-4 index shuffle
> +; %5 load mask
> +%macro SHUFFLE_BYTES_SSE2 5
> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
> +%if %5
> +    pcmpeqw        m2, m2
> +    psllw          m3, m2, 8 ; (word) { 0xff00 } x8
> +    psrlw          m2, 8     ; (word) { 0x00ff } x8
> +%endif
>  
>      movsxdifnidn wq, wd
>      mov xq, wq
> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>      je .loop_simd
>  
>  .loop_scalar:
> -   mov          tmpb, [srcq + wq + 2]
> +   mov          tmpb, [srcq + wq + %1]
>     mov [dstq+wq + 0], tmpb
> -   mov          tmpb, [srcq + wq + 1]
> +   mov          tmpb, [srcq + wq + %2]
>     mov [dstq+wq + 1], tmpb
> -   mov          tmpb, [srcq + wq + 0]
> +   mov          tmpb, [srcq + wq + %3]
>     mov [dstq+wq + 2], tmpb
> -   mov          tmpb, [srcq + wq + 3]
> +   mov          tmpb, [srcq + wq + %4]
>     mov [dstq+wq + 3], tmpb
>     add            wq, 4
>     sub            xq, 4
> @@ -86,29 +124,26 @@ jge .end
>  
>  .loop_simd:
>      movu     m0, [srcq+wq]
> -    movu     m1, [srcq+wq+8]
> -
> -    pshufw   m3, m0, 177
> -    pshufw   m5, m1, 177
> -
> -    pand     m0, m7
> -    pand     m3, m6
>  
> -    pand     m1, m7
> -    pand     m5, m6
> +    SHUFFLE%1%2%3%4_SSE2
>  
> -    por      m0, m3
> -    por      m1, m5
> +    por      m0, m1
>  
>      movu      [dstq+wq], m0
> -    movu  [dstq+wq + 8], m1
>  
> -    add              wq, mmsize*2
> +    add              wq, mmsize
>      jl .loop_simd
>  
>  .end:
> -    emms
>      RET
> +%endmacro
> +
> +INIT_XMM sse2
> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>  
>  ;------------------------------------------------------------------------------
>  ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)

How old are the youngest processors with SSE2 but without SSSE3?
According to Wikipedia, nearly 15 years, which makes me believe that the
SSE2 versions are not worth it (how many of these CPUs will run a new
FFmpeg anyway?).

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  reply	other threads:[~2024-06-06 15:04 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-05 20:51 [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: replace shuffle_bytes_2103_mmxext with an SSE2 version James Almer
2024-06-06 14:15 ` [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions James Almer
2024-06-06 14:48   ` Andreas Rheinhardt [this message]
2024-06-06 15:45     ` James Almer
2024-06-08 15:55       ` Andreas Rheinhardt
2024-06-08 16:21         ` Rémi Denis-Courmont
2024-06-09 15:36         ` James Almer
2024-06-09 16:05           ` Rémi Denis-Courmont
2024-06-10 17:06           ` James Almer
2024-06-11  5:18             ` Andreas Rheinhardt

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=AS8P250MB07441B4682A3DE6F4B2D9AD18FFA2@AS8P250MB0744.EURP250.PROD.OUTLOOK.COM \
    --to=andreas.rheinhardt@outlook.com \
    --cc=ffmpeg-devel@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git