Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/2] avcodec/x86/v210: add some comments to the improved avx2 function
@ 2022-12-15 10:49 James Darnley
  2022-12-15 10:49 ` [FFmpeg-devel] [RFC PATCH 2/2] avcodec/x86: add avx512icl function for v210dec James Darnley
  0 siblings, 1 reply; 2+ messages in thread
From: James Darnley @ 2022-12-15 10:49 UTC (permalink / raw)
  To: ffmpeg-devel

---
 libavcodec/x86/v210.asm | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
index 3b9e0761df..600a4ddc5f 100644
--- a/libavcodec/x86/v210.asm
+++ b/libavcodec/x86/v210.asm
@@ -65,18 +65,18 @@ cglobal v210_planar_unpack_%1, 5, 5, 6 + 2 * cpuflag(avx2), src, y, u, v, w
     mova   m0, [srcq]
 %endif
 
-    pmullw m1, m0, m3
-    pslld  m0, 12
-    psrlw  m1, 6                       ; yB yA u5 v4 y8 y7 v3 u3 y5 y4 u2 v1 y2 y1 v0 u0
-    psrld  m0, 22                      ; 00 v5 00 y9 00 u4 00 y6 00 v2 00 y3 00 u1 00 y0
+    pmullw m1, m0, m3 ; shifts the 1st and 3rd sample of each dword into the high 10 bits of each word
+    pslld  m0, 12     ; shifts the 2nd sample of each dword into the high 10 bits of each dword
+    psrlw  m1, 6      ; shifts the 1st and 3rd samples back into the low 10 bits
+    psrld  m0, 22     ; shifts the 2nd sample back into the low 10 bits of each dword
 
 %if cpuflag(avx2)
-    vpblendd m2, m1, m0, 0x55          ; yB yA 00 y9 y8 y7 00 y6 y5 y4 00 y3 y2 y1 00 y0
+    vpblendd m2, m1, m0, 0x55 ; merge the odd dwords from m0 and even from m1 ; yB yA 00 y9 y8 y7 00 y6 y5 y4 00 y3 y2 y1 00 y0
     pshufb m2, m4                      ; 00 00 yB yA y9 y8 y7 y6 00 00 y5 y4 y3 y2 y1 y0
     vpermd m2, m6, m2                  ; 00 00 00 00 yB yA y9 y8 y7 y6 y5 y4 y3 y2 y1 y0
     movu   [yq+2*wq], m2
 
-    vpblendd m1, m1, m0, 0xaa          ; 00 v5 u5 v4 00 u4 v3 u3 00 v2 u2 v1 00 u1 v0 u0
+    vpblendd m1, m1, m0, 0xaa ; merge the even dwords from m0 and odd from m1 ; 00 v5 u5 v4 00 u4 v3 u3 00 v2 u2 v1 00 u1 v0 u0
     pshufb m1, m5                      ; 00 v5 v4 v3 00 u5 u4 u3 00 v2 v1 v0 00 u2 u1 u0
     vpermq m1, m1, 0xd8                ; 00 v5 v4 v3 00 v2 v1 v0 00 u5 u4 u3 00 u2 u1 u0
     pshufb m1, m7                      ; 00 00 v5 v4 v3 v2 v1 v0 00 00 u5 u4 u3 u2 u1 u0
-- 
2.38.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [FFmpeg-devel] [RFC PATCH 2/2] avcodec/x86: add avx512icl function for v210dec
  2022-12-15 10:49 [FFmpeg-devel] [PATCH 1/2] avcodec/x86/v210: add some comments to the improved avx2 function James Darnley
@ 2022-12-15 10:49 ` James Darnley
  0 siblings, 0 replies; 2+ messages in thread
From: James Darnley @ 2022-12-15 10:49 UTC (permalink / raw)
  To: ffmpeg-devel

Ice Lake (Xeon Silver 4316): 2.01x faster (1147±36.8 vs. 571±38.2 decicycles) compared with avx2
---

I think I can merge this with the existing macro without it being too ugly.
That might allow a plain avx512 version too but I can't say if that would be any
faster.

 libavcodec/x86/v210-init.c | 10 ++++++-
 libavcodec/x86/v210.asm    | 60 +++++++++++++++++++++++++++++++++++++-
 tests/checkasm/v210dec.c   | 12 ++++----
 3 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
index 5db1fef98c..8b3677b8aa 100644
--- a/libavcodec/x86/v210-init.c
+++ b/libavcodec/x86/v210-init.c
@@ -17,7 +17,7 @@
  */
 
 #include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
 #include "libavcodec/v210dec.h"
 
 extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
@@ -28,6 +28,8 @@ extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y
 extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
 extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
 
+extern void ff_v210_planar_unpack_avx512icl(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
 av_cold void ff_v210_x86_init(V210DecContext *s)
 {
 #if HAVE_X86ASM
@@ -42,6 +44,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
 
         if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
             s->unpack_frame = ff_v210_planar_unpack_aligned_avx2;
+
+        if (EXTERNAL_AVX512ICL(cpu_flags))
+            s->unpack_frame = ff_v210_planar_unpack_avx512icl;
     }
     else {
         if (cpu_flags & AV_CPU_FLAG_SSSE3)
@@ -52,6 +57,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
 
         if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
             s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2;
+
+        if (EXTERNAL_AVX512ICL(cpu_flags))
+            s->unpack_frame = ff_v210_planar_unpack_avx512icl;
     }
 #endif
 }
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
index 600a4ddc5f..f247737ed0 100644
--- a/libavcodec/x86/v210.asm
+++ b/libavcodec/x86/v210.asm
@@ -22,7 +22,21 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA 32
+SECTION_RODATA 64
+
+perm_y:
+    db  0,1,   4,5,   6,7,   8,9,  12,13, 14,15, 16,17, 20,21
+    db 22,23, 24,25, 28,29, 30,31, 32,33, 36,37, 38,39, 40,41
+    db 44,45, 46,47, 48,49, 52,53, 54,55, 56,57, 60,61, 62,63
+times 16 db 0xff ; align to 64
+
+perm_uv:
+    db  0,1,   4,5,  10,11, 16,17, 20,21, 26,27, 32,33, 36,37
+    db 42,43, 48,49, 52,53, 58,59
+times 8 db 0xff ; align to 32
+    db  2,3,   8,9,  12,13, 18,19, 24,25, 28,29, 34,35, 40,41
+    db 44,45, 50,51, 56,57, 60,61
+times 8 db 0xff ; align to 32
 
 ; for AVX2 version only
 v210_luma_permute: dd 0,1,2,4,5,6,7,7  ; 32-byte alignment required
@@ -34,6 +48,9 @@ v210_mult: dw 64,4,64,4,64,4,64,4
 v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
 v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
 
+shift: times 4 dw 6, 2
+kmask: dw 0x5555, 0xaaaa
+
 SECTION .text
 
 %macro v210_planar_unpack 1
@@ -127,3 +144,44 @@ v210_planar_unpack aligned
 INIT_YMM avx2
 v210_planar_unpack aligned
 %endif
+
+%if HAVE_AVX512ICL_EXTERNAL
+
+INIT_ZMM avx512icl
+
+cglobal v210_planar_unpack, 5, 5, 6, src, y, u, v, w ; unpack packed dwords at src into separate y, u and v word arrays; w counts y samples (y is indexed by 2*w bytes)
+    movsxdifnidn wq, wd ; widen the 32-bit width argument to a full register (x86inc helper; presumably a sign extension - confirm in x86inc.asm)
+    lea    yq, [yq+2*wq] ; point y at its end: w samples of 2 bytes each
+    add    uq, wq        ; point u at its end: w bytes
+    add    vq, wq        ; likewise for v
+    neg    wq            ; w is now a negative offset from those end pointers and counts up towards zero
+
+    kmovw k1, [kmask]   ; odd dword mask: 0x5555 sets bits 0, 2, 4, ... (the 1st, 3rd, ... dword)
+    kmovw k2, [kmask+2] ; even dword mask: 0xaaaa sets bits 1, 3, 5, ... (the 2nd, 4th, ... dword)
+
+    VBROADCASTI128 m0, [shift]   ; per-word left shift counts 6, 2, 6, 2, ... repeated in every 128-bit lane
+    mova           m1, [perm_y]  ; byte indices used to pack the y words together
+    mova           m2, [perm_uv] ; byte indices used to pack the words stored to u (low 32 bytes) and v (high 32 bytes)
+
+    .loop:
+        movu    m3, [srcq]  ; unaligned load of one full vector of packed input
+        vpsllvw m4, m3, m0  ; shift the low word of each dword left by 6 and the high word left by 2
+        pslld   m5, m3, 12  ; shift each whole dword left by 12
+        psrlw   m4, 6       ; words of m4 now hold source dword bits 0-9 and 20-29 in their low 10 bits
+        psrld   m5, 22      ; dwords of m5 now hold source dword bits 10-19 in their low 10 bits
+
+        vpblendmd m3{k1}, m4, m5 ; take m5 in the dwords selected by k1 and m4 in the others
+        vpermb    m3, m1, m3 ; could use vpcompressw; moves the bytes named by perm_y to the front of the register
+        movu      [yq+2*wq], m3 ; full-vector store; NOTE(review): only the first 48 index bytes of perm_y are real (the rest are 0xff padding) and w advances by 48 bytes of y per iteration, so the tail of this store lands beyond the samples just produced
+
+        vpblendmd     m5{k2}, m4, m5 ; take m5 in the dwords selected by k2 and m4 in the others
+        vpermb        m5, m2, m5     ; reorder bytes according to perm_uv
+        movu          [uq+wq], ym5   ; store the low 256 bits to u
+        vextracti32x8 [vq+wq], zm5, 1 ; store the high 256 bits to v
+
+        add srcq, mmsize ; advance the input pointer by one vector
+        add wq, (mmsize*3)/8 ; advance the sample offset; 24 if mmsize is 64 (expected under INIT_ZMM - confirm)
+    jl  .loop ; repeat while the offset is still negative
+RET
+
+%endif
diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c
index 6aef519cc5..93993bae71 100644
--- a/tests/checkasm/v210dec.c
+++ b/tests/checkasm/v210dec.c
@@ -54,12 +54,12 @@ void checkasm_check_v210dec(void)
     if (check_func(h.unpack_frame, "v210_unpack")) {
         uint32_t src0[NUM_SAMPLES/3];
         uint32_t src1[NUM_SAMPLES/3];
-        uint16_t y0[NUM_SAMPLES/2];
-        uint16_t y1[NUM_SAMPLES/2];
-        uint16_t u0[NUM_SAMPLES/4];
-        uint16_t u1[NUM_SAMPLES/4];
-        uint16_t v0[NUM_SAMPLES/4];
-        uint16_t v1[NUM_SAMPLES/4];
+        uint16_t y0[NUM_SAMPLES/2 + 15];
+        uint16_t y1[NUM_SAMPLES/2 + 15];
+        uint16_t u0[NUM_SAMPLES/4 + 7];
+        uint16_t u1[NUM_SAMPLES/4 + 7];
+        uint16_t v0[NUM_SAMPLES/4 + 7];
+        uint16_t v1[NUM_SAMPLES/4 + 7];
         declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
         const int pixels = NUM_SAMPLES / 2 / 6 * 6;
 
-- 
2.38.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2022-12-15 10:51 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-15 10:49 [FFmpeg-devel] [PATCH 1/2] avcodec/x86/v210: add some comments to the improved avx2 function James Darnley
2022-12-15 10:49 ` [FFmpeg-devel] [RFC PATCH 2/2] avcodec/x86: add avx512icl function for v210dec James Darnley

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git