Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/2] swscale/x86/input: add AVX2 optimized RGB32 to YUV functions
@ 2024-06-05 20:28 James Almer
  2024-06-05 20:28 ` [FFmpeg-devel] [PATCH 2/2] swscale/x86/input: add AVX2 optimized uyvytoyuv422 James Almer
  0 siblings, 1 reply; 6+ messages in thread
From: James Almer @ 2024-06-05 20:28 UTC (permalink / raw)
  To: ffmpeg-devel

abgr_to_uv_8_c: 43.3
abgr_to_uv_8_sse2: 14.3
abgr_to_uv_8_avx: 15.3
abgr_to_uv_8_avx2: 18.8
abgr_to_uv_128_c: 650.3
abgr_to_uv_128_sse2: 110.8
abgr_to_uv_128_avx: 112.3
abgr_to_uv_128_avx2: 64.8
abgr_to_uv_1080_c: 5456.3
abgr_to_uv_1080_sse2: 888.8
abgr_to_uv_1080_avx: 900.8
abgr_to_uv_1080_avx2: 518.3
abgr_to_uv_1920_c: 9692.3
abgr_to_uv_1920_sse2: 1593.8
abgr_to_uv_1920_avx: 1613.3
abgr_to_uv_1920_avx2: 864.8
abgr_to_y_8_c: 23.3
abgr_to_y_8_sse2: 12.8
abgr_to_y_8_avx: 13.3
abgr_to_y_8_avx2: 17.3
abgr_to_y_128_c: 308.3
abgr_to_y_128_sse2: 67.3
abgr_to_y_128_avx: 66.8
abgr_to_y_128_avx2: 44.8
abgr_to_y_1080_c: 2371.3
abgr_to_y_1080_sse2: 512.8
abgr_to_y_1080_avx: 505.8
abgr_to_y_1080_avx2: 314.3
abgr_to_y_1920_c: 4177.3
abgr_to_y_1920_sse2: 915.8
abgr_to_y_1920_avx: 926.8
abgr_to_y_1920_avx2: 519.3
bgra_to_uv_8_c: 37.3
bgra_to_uv_8_sse2: 13.3
bgra_to_uv_8_avx: 14.8
bgra_to_uv_8_avx2: 19.8
bgra_to_uv_128_c: 563.8
bgra_to_uv_128_sse2: 111.3
bgra_to_uv_128_avx: 112.3
bgra_to_uv_128_avx2: 64.8
bgra_to_uv_1080_c: 4691.8
bgra_to_uv_1080_sse2: 893.8
bgra_to_uv_1080_avx: 899.8
bgra_to_uv_1080_avx2: 517.8
bgra_to_uv_1920_c: 8332.8
bgra_to_uv_1920_sse2: 1590.8
bgra_to_uv_1920_avx: 1605.8
bgra_to_uv_1920_avx2: 867.3
bgra_to_y_8_c: 22.3
bgra_to_y_8_sse2: 12.8
bgra_to_y_8_avx: 12.8
bgra_to_y_8_avx2: 17.3
bgra_to_y_128_c: 291.3
bgra_to_y_128_sse2: 67.8
bgra_to_y_128_avx: 69.3
bgra_to_y_128_avx2: 45.3
bgra_to_y_1080_c: 2357.3
bgra_to_y_1080_sse2: 508.3
bgra_to_y_1080_avx: 518.3
bgra_to_y_1080_avx2: 399.8
bgra_to_y_1920_c: 4202.8
bgra_to_y_1920_sse2: 906.8
bgra_to_y_1920_avx: 907.3
bgra_to_y_1920_avx2: 526.3

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libswscale/x86/input.asm | 51 ++++++++++++++++++++++++++++++++++++----
 libswscale/x86/swscale.c |  8 +++++++
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index e79fe11405..f1ad6a53fd 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -380,8 +380,13 @@ RGB24_FUNCS 11, 13
 ; %2-5 = rgba, bgra, argb or abgr (in individual characters)
 %macro RGB32_TO_Y_FN 5-6
 cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
+%if mmsize == 32
+    vbroadcasti128 m5, [rgba_Ycoeff_%2%4]
+    vbroadcasti128 m6, [rgba_Ycoeff_%3%5]
+%else
     mova           m5, [rgba_Ycoeff_%2%4]
     mova           m6, [rgba_Ycoeff_%3%5]
+%endif
 %if %0 == 6
     jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body
 %else ; %0 == 6
@@ -394,13 +399,21 @@ cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
     lea          srcq, [srcq+wq*2]
     add          dstq, wq
     neg            wq
+%if mmsize == 32
+    vbroadcasti128 m4, [rgb_Yrnd]
+%else
     mova           m4, [rgb_Yrnd]
+%endif
     pcmpeqb        m7, m7
     psrlw          m7, 8                  ; (word) { 0x00ff } x4
 .loop:
     ; FIXME check alignment and use mova
-    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
-    movu           m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    movu          xm0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu          xm2, [srcq+wq*2+16]     ; (byte) { Bx, Gx, Rx, xx }[4-7]
+%if mmsize == 32
+    vinserti128    m0, m0, [srcq+wq*2+32], 1
+    vinserti128    m2, m2, [srcq+wq*2+48], 1
+%endif
     DEINTB          1,  0,  3,  2,  7     ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
     pmaddwd        m1, m5                 ; (dword) { Bx*BY + Rx*RY }[0-3]
     pmaddwd        m0, m6                 ; (dword) { Gx*GY }[0-3]
@@ -421,6 +434,7 @@ cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
     add            srcq, 2*mmsize - 2
     add            dstq, mmsize - 1
 .loop2:
+INIT_XMM cpuname
     movd           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
     DEINTB          1,  0,  3,  2,  7     ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
     pmaddwd        m1, m5                 ; (dword) { Bx*BY + Rx*RY }[0-3]
@@ -433,6 +447,9 @@ cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
     add            wq, 2
     jl .loop2
 .end:
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
     RET
 %endif ; %0 == 3
 %endmacro
@@ -442,10 +459,17 @@ cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
 %macro RGB32_TO_UV_FN 5-6
 cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
 %if ARCH_X86_64
+%if mmsize == 32
+    vbroadcasti128  m8, [rgba_Ucoeff_%2%4]
+    vbroadcasti128  m9, [rgba_Ucoeff_%3%5]
+    vbroadcasti128 m10, [rgba_Vcoeff_%2%4]
+    vbroadcasti128 m11, [rgba_Vcoeff_%3%5]
+%else
     mova           m8, [rgba_Ucoeff_%2%4]
     mova           m9, [rgba_Ucoeff_%3%5]
     mova          m10, [rgba_Vcoeff_%2%4]
     mova          m11, [rgba_Vcoeff_%3%5]
+%endif
 %define coeffU1 m8
 %define coeffU2 m9
 %define coeffV1 m10
@@ -473,11 +497,19 @@ cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
     neg            wq
     pcmpeqb        m7, m7
     psrlw          m7, 8                  ; (word) { 0x00ff } x4
+%if mmsize == 32
+    vbroadcasti128 m6, [rgb_UVrnd]
+%else
     mova           m6, [rgb_UVrnd]
+%endif
 .loop:
     ; FIXME check alignment and use mova
-    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
-    movu           m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    movu          xm0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu          xm4, [srcq+wq*2+16]     ; (byte) { Bx, Gx, Rx, xx }[4-7]
+%if mmsize == 32
+    vinserti128    m0, m0, [srcq+wq*2+32], 1
+    vinserti128    m4, m4, [srcq+wq*2+48], 1
+%endif
     DEINTB          1,  0,  5,  4,  7     ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
     pmaddwd        m3, m1, coeffV1        ; (dword) { Bx*BV + Rx*RV }[0-3]
     pmaddwd        m2, m0, coeffV2        ; (dword) { Gx*GV }[0-3]
@@ -511,6 +543,7 @@ cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
     add            dstUq, mmsize - 1
     add            dstVq, mmsize - 1
 .loop2:
+INIT_XMM cpuname
     movd           m0, [srcq+wq*2]        ; (byte) { Bx, Gx, Rx, xx }[0-3]
     DEINTB          1,  0,  5,  4,  7     ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
     pmaddwd        m3, m1, coeffV1        ; (dword) { Bx*BV + Rx*RV }[0-3]
@@ -530,6 +563,9 @@ cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
     add            wq, 2
     jl .loop2
 .end:
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
     RET
 %endif ; ARCH_X86_64 && %0 == 3
 %endmacro
@@ -556,6 +592,13 @@ INIT_XMM avx
 RGB32_FUNCS 8, 12
 %endif
 
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+RGB32_FUNCS 8, 12
+%endif
+%endif
+
 ;-----------------------------------------------------------------------------
 ; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
 ;
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 1438c077e6..5a9da23265 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -321,6 +321,10 @@ void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
 INPUT_FUNCS(sse2);
 INPUT_FUNCS(ssse3);
 INPUT_FUNCS(avx);
+INPUT_FUNC(rgba, avx2);
+INPUT_FUNC(bgra, avx2);
+INPUT_FUNC(argb, avx2);
+INPUT_FUNC(abgr, avx2);
 INPUT_FUNC(rgb24, avx2);
 INPUT_FUNC(bgr24, avx2);
 
@@ -640,6 +644,10 @@ switch(c->dstBpc){ \
             switch (c->srcFormat) {
             case_rgb(rgb24, RGB24, avx2);
             case_rgb(bgr24, BGR24, avx2);
+            case_rgb(bgra,  BGRA,  avx2);
+            case_rgb(rgba,  RGBA,  avx2);
+            case_rgb(abgr,  ABGR,  avx2);
+            case_rgb(argb,  ARGB,  avx2);
             }
         switch (c->dstFormat) {
         case AV_PIX_FMT_NV12:
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [FFmpeg-devel] [PATCH 2/2] swscale/x86/input: add AVX2 optimized uyvytoyuv422
  2024-06-05 20:28 [FFmpeg-devel] [PATCH 1/2] swscale/x86/input: add AVX2 optimized RGB32 to YUV functions James Almer
@ 2024-06-05 20:28 ` James Almer
  2024-06-05 21:00   ` Andreas Rheinhardt
  0 siblings, 1 reply; 6+ messages in thread
From: James Almer @ 2024-06-05 20:28 UTC (permalink / raw)
  To: ffmpeg-devel

uyvytoyuv422_c: 23991.8
uyvytoyuv422_sse2: 2817.8
uyvytoyuv422_avx: 2819.3
uyvytoyuv422_avx2: 1972.3

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libswscale/x86/rgb2rgb.c     |  6 ++++++
 libswscale/x86/rgb_2_rgb.asm | 32 ++++++++++++++++++++++++--------
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index b325e5dbd5..21ccfafe51 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -136,6 +136,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                          const uint8_t *src, int width, int height,
                          int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
 #endif
 
 av_cold void rgb2rgb_init_x86(void)
@@ -177,5 +180,8 @@ av_cold void rgb2rgb_init_x86(void)
     if (EXTERNAL_AVX(cpu_flags)) {
         uyvytoyuv422 = ff_uyvytoyuv422_avx;
     }
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        uyvytoyuv422 = ff_uyvytoyuv422_avx2;
+    }
 #endif
 }
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 76ca1eec03..0bf1278718 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -34,13 +34,16 @@ pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
 SECTION .text
 
-%macro RSHIFT_COPY 3
+%macro RSHIFT_COPY 5
 ; %1 dst ; %2 src ; %3 shift
-%if cpuflag(avx)
-    psrldq  %1, %2, %3
+%if mmsize == 32
+    vperm2i128 %1, %2, %3, %5
+    RSHIFT         %1, %4
+%elif cpuflag(avx)
+    psrldq  %1, %2, %4
 %else
     mova           %1, %2
-    RSHIFT         %1, %3
+    RSHIFT         %1, %4
 %endif
 %endmacro
 
@@ -233,26 +236,37 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
     jge .end_line
 
     .loop_simd:
+%if mmsize == 32
+        movu   xm2, [srcq + wtwoq         ]
+        movu   xm3, [srcq + wtwoq + 16    ]
+        movu   xm4, [srcq + wtwoq + 16 * 2]
+        movu   xm5, [srcq + wtwoq + 16 * 3]
+        vinserti128 m2, m2, [srcq + wtwoq + 16 * 4], 1
+        vinserti128 m3, m3, [srcq + wtwoq + 16 * 5], 1
+        vinserti128 m4, m4, [srcq + wtwoq + 16 * 6], 1
+        vinserti128 m5, m5, [srcq + wtwoq + 16 * 7], 1
+%else
         movu    m2, [srcq + wtwoq             ]
         movu    m3, [srcq + wtwoq + mmsize    ]
         movu    m4, [srcq + wtwoq + mmsize * 2]
         movu    m5, [srcq + wtwoq + mmsize * 3]
+%endif
 
         ; extract y part 1
-        RSHIFT_COPY    m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
+        RSHIFT_COPY    m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
         pand           m6, m1; YxYx YxYx...
 
-        RSHIFT_COPY    m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
+        RSHIFT_COPY    m7, m3, m5, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
         pand           m7, m1 ; YxYx YxYx...
 
         packuswb       m6, m7 ; YYYY YYYY...
         movu [ydstq + wq], m6
 
         ; extract y part 2
-        RSHIFT_COPY    m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
+        RSHIFT_COPY    m6, m4, m2, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
         pand           m6, m1; YxYx YxYx...
 
-        RSHIFT_COPY    m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
+        RSHIFT_COPY    m7, m5, m3, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
         pand           m7, m1 ; YxYx YxYx...
 
         packuswb                m6, m7 ; YYYY YYYY...
@@ -309,4 +323,6 @@ UYVY_TO_YUV422
 
 INIT_XMM avx
 UYVY_TO_YUV422
+INIT_YMM avx2
+UYVY_TO_YUV422
 %endif
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale/x86/input: add AVX2 optimized uyvytoyuv422
  2024-06-05 20:28 ` [FFmpeg-devel] [PATCH 2/2] swscale/x86/input: add AVX2 optimized uyvytoyuv422 James Almer
@ 2024-06-05 21:00   ` Andreas Rheinhardt
  2024-06-06  6:03     ` Rémi Denis-Courmont
  0 siblings, 1 reply; 6+ messages in thread
From: Andreas Rheinhardt @ 2024-06-05 21:00 UTC (permalink / raw)
  To: ffmpeg-devel

James Almer:
> uyvytoyuv422_c: 23991.8
> uyvytoyuv422_sse2: 2817.8
> uyvytoyuv422_avx: 2819.3

Why don't you nuke the avx version in a follow-up patch?

> uyvytoyuv422_avx2: 1972.3
> 
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libswscale/x86/rgb2rgb.c     |  6 ++++++
>  libswscale/x86/rgb_2_rgb.asm | 32 ++++++++++++++++++++++++--------
>  2 files changed, 30 insertions(+), 8 deletions(-)
> 
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index b325e5dbd5..21ccfafe51 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -136,6 +136,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>  void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>                           const uint8_t *src, int width, int height,
>                           int lumStride, int chromStride, int srcStride);
> +void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> +                          const uint8_t *src, int width, int height,
> +                          int lumStride, int chromStride, int srcStride);
>  #endif
>  
>  av_cold void rgb2rgb_init_x86(void)
> @@ -177,5 +180,8 @@ av_cold void rgb2rgb_init_x86(void)
>      if (EXTERNAL_AVX(cpu_flags)) {
>          uyvytoyuv422 = ff_uyvytoyuv422_avx;
>      }
> +    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> +        uyvytoyuv422 = ff_uyvytoyuv422_avx2;
> +    }
>  #endif
>  }
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index 76ca1eec03..0bf1278718 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -34,13 +34,16 @@ pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
>  
>  SECTION .text
>  
> -%macro RSHIFT_COPY 3
> +%macro RSHIFT_COPY 5
>  ; %1 dst ; %2 src ; %3 shift
> -%if cpuflag(avx)
> -    psrldq  %1, %2, %3
> +%if mmsize == 32
> +    vperm2i128 %1, %2, %3, %5
> +    RSHIFT         %1, %4
> +%elif cpuflag(avx)
> +    psrldq  %1, %2, %4
>  %else
>      mova           %1, %2
> -    RSHIFT         %1, %3
> +    RSHIFT         %1, %4
>  %endif
>  %endmacro
>  
> @@ -233,26 +236,37 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>      jge .end_line
>  
>      .loop_simd:
> +%if mmsize == 32
> +        movu   xm2, [srcq + wtwoq         ]
> +        movu   xm3, [srcq + wtwoq + 16    ]
> +        movu   xm4, [srcq + wtwoq + 16 * 2]
> +        movu   xm5, [srcq + wtwoq + 16 * 3]
> +        vinserti128 m2, m2, [srcq + wtwoq + 16 * 4], 1
> +        vinserti128 m3, m3, [srcq + wtwoq + 16 * 5], 1
> +        vinserti128 m4, m4, [srcq + wtwoq + 16 * 6], 1
> +        vinserti128 m5, m5, [srcq + wtwoq + 16 * 7], 1
> +%else
>          movu    m2, [srcq + wtwoq             ]
>          movu    m3, [srcq + wtwoq + mmsize    ]
>          movu    m4, [srcq + wtwoq + mmsize * 2]
>          movu    m5, [srcq + wtwoq + mmsize * 3]
> +%endif
>  
>          ; extract y part 1
> -        RSHIFT_COPY    m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
> +        RSHIFT_COPY    m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
>          pand           m6, m1; YxYx YxYx...
>  
> -        RSHIFT_COPY    m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
> +        RSHIFT_COPY    m7, m3, m5, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
>          pand           m7, m1 ; YxYx YxYx...
>  
>          packuswb       m6, m7 ; YYYY YYYY...
>          movu [ydstq + wq], m6
>  
>          ; extract y part 2
> -        RSHIFT_COPY    m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
> +        RSHIFT_COPY    m6, m4, m2, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
>          pand           m6, m1; YxYx YxYx...
>  
> -        RSHIFT_COPY    m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
> +        RSHIFT_COPY    m7, m5, m3, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
>          pand           m7, m1 ; YxYx YxYx...
>  
>          packuswb                m6, m7 ; YYYY YYYY...
> @@ -309,4 +323,6 @@ UYVY_TO_YUV422
>  
>  INIT_XMM avx
>  UYVY_TO_YUV422
> +INIT_YMM avx2
> +UYVY_TO_YUV422
>  %endif

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale/x86/input: add AVX2 optimized uyvytoyuv422
  2024-06-05 21:00   ` Andreas Rheinhardt
@ 2024-06-06  6:03     ` Rémi Denis-Courmont
  2024-06-06  7:01       ` Christophe Gisquet
  0 siblings, 1 reply; 6+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-06  6:03 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



Le 6 juin 2024 00:00:57 GMT+03:00, Andreas Rheinhardt <andreas.rheinhardt@outlook.com> a écrit :
>James Almer:
>> uyvytoyuv422_c: 23991.8
>> uyvytoyuv422_sse2: 2817.8
>> uyvytoyuv422_avx: 2819.3
>
>Why don't you nuke the avx version in a follow-up patch?

Same problem with the RGBA stuff as well. Are the AVX functions expected to be faster than SSE2 on processors *without* AVX2?
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale/x86/input: add AVX2 optimized uyvytoyuv422
  2024-06-06  6:03     ` Rémi Denis-Courmont
@ 2024-06-06  7:01       ` Christophe Gisquet
  2024-06-07  7:24         ` Rémi Denis-Courmont
  0 siblings, 1 reply; 6+ messages in thread
From: Christophe Gisquet @ 2024-06-06  7:01 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le jeu. 6 juin 2024 à 08:11, Rémi Denis-Courmont <remi@remlab.net> a écrit :
> >James Almer:
> >> uyvytoyuv422_c: 23991.8
> >> uyvytoyuv422_sse2: 2817.8
> >> uyvytoyuv422_avx: 2819.3
> >
> >Why don't you nuke the avx version in a follow-up patch?
>
> Same problem with the RGBA stuff as well. Are the AVX functions expected to be faster than SSE2 on processors *without* AVX2?

Something frequent in this type of question is that people are using
numbers from a CPU that has had 10 years of arch improvements (and
probably a doubling in throughput for any instruction set) over one
that supported at most AVX. The presence of an AVX function (whose
benefit is only 3-operand instructions, so admittedly small) would
ideally only be benchmarked on that kind of CPU.

Case in point, at that time, even x264 introduced avx versions, so
there was a time and CPU generations where yes, it was faster:
https://code.videolan.org/search?search=INIT_XMM%20avx&nav_source=navbar&project_id=536&group_id=9&search_code=true&repository_ref=master
https://code.videolan.org/videolan/x264/-/commit/abc2283e9abc6254744bf6dd148ac25433cdf80e

But I understand the point is that any type of maintenance for a minor
improvement on a few CPUs, which are maybe 1% of a userbase, is not
appealing.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale/x86/input: add AVX2 optimized uyvytoyuv422
  2024-06-06  7:01       ` Christophe Gisquet
@ 2024-06-07  7:24         ` Rémi Denis-Courmont
  0 siblings, 0 replies; 6+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-07  7:24 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



Le 6 juin 2024 10:01:24 GMT+03:00, Christophe Gisquet <christophe.gisquet@gmail.com> a écrit :
>Le jeu. 6 juin 2024 à 08:11, Rémi Denis-Courmont <remi@remlab.net> a écrit :
>> >James Almer:
>> >> uyvytoyuv422_c: 23991.8
>> >> uyvytoyuv422_sse2: 2817.8
>> >> uyvytoyuv422_avx: 2819.3
>> >
>> >Why don't you nuke the avx version in a follow-up patch?
>>
>> Same problem with the RGBA stuff as well. Are the AVX functions expected to be faster than SSE2 on processors *without* AVX2?
>
>Something frequent in this type of question is that people are using
>numbers from a CPU that has had 10 years of arch improvements (and
>probably a doubling in throughput for any instruction set) over one
>that supported at most AVX. The presence of an AVX function (whose
>benefit is only 3-operand instructions, so admittedly small) would
>ideally only be benchmarked on that kind of CPU.

It feels a bit dense for someone not intimate with x86 innards such as I. Intuitively, 3-operand instructions are certainly helpful in avoiding vector copies around destructive operations. But it should be clear whether, for any given function, this does or does not help.

That being said, I have no objections as such to run-time optimisations for middle-aged processors.

If anything, I think we should auto-trim the useless C code at least via DCE if we want to save space. We can assume at least SSE2 on x86-64, no? Of course this will break checkasm and some command line flags.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-06-07  7:25 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-05 20:28 [FFmpeg-devel] [PATCH 1/2] swscale/x86/input: add AVX2 optimized RGB32 to YUV functions James Almer
2024-06-05 20:28 ` [FFmpeg-devel] [PATCH 2/2] swscale/x86/input: add AVX2 optimized uyvytoyuv422 James Almer
2024-06-05 21:00   ` Andreas Rheinhardt
2024-06-06  6:03     ` Rémi Denis-Courmont
2024-06-06  7:01       ` Christophe Gisquet
2024-06-07  7:24         ` Rémi Denis-Courmont

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git