Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: James Darnley <jdarnley@obe.tv>
To: ffmpeg-devel@ffmpeg.org
Subject: [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl
Date: Mon, 21 Nov 2022 13:44:08 +0100
Message-ID: <20221121124408.1577897-3-jdarnley@obe.tv> (raw)
In-Reply-To: <20221121124408.1577897-1-jdarnley@obe.tv>

avx512 on Skylake-X (Xeon D-2123IT):
1.19x faster (970±91.2 vs. 817±104.4 decicycles) compared with avx2

avx512icl on Ice Lake (Xeon Silver 4316):
2.52x faster (1350±5.3 vs. 535±9.5 decicycles) compared with avx2
---
 libavcodec/x86/v210enc.asm    | 99 +++++++++++++++++++++++++++++++++++
 libavcodec/x86/v210enc_init.c | 12 +++++
 2 files changed, 111 insertions(+)

diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index c2ad3d72c0..9cee954619 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -56,6 +56,36 @@ v210enc_8_permd: dd 0,1,4,5, 1,2,5,6
 v210enc_8_mult: db 4, 0, 64, 0
 v210enc_8_mask: dd 255<<12
 
+icl_perm_y: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb
+%assign i 0
+%rep 8
+    db -1,i+0,i+1,-1 , i+2,i+3,i+4,i+5 ; 8 output bytes from 6 source bytes; the -1 slots are zeroed through icl_perm_y_kmask
+    %assign i i+6
+%endrep
+
+icl_perm_uv: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb
+%assign i 0
+%rep 4
+    db i+0,i+1,i+32,i+33 , -1,i+2,i+3,-1 , i+34,i+35,i+4,i+5 , -1,i+36,i+37,-1 ; indices 0-31 select the low 256 bits, 32-63 the high 256 bits
+    %assign i i+6
+%endrep
+
+icl_perm_y_kmask:  times 8 db 0b1111_0110 ; loaded into k1: clear bits line up with the -1 slots of icl_perm_y
+icl_perm_uv_kmask: times 8 db 0b0110_1111 ; loaded into k2: clear bits line up with the -1 slots of icl_perm_uv
+
+icl_shift_y:  times 10 dw 2,0,4 ; per-word shift counts applied to the y vector with vpsllvw
+              times 4 db 0 ; padding to 64 bytes
+icl_shift_uv: times 5 dw 0,2,4 ; per-word shift counts for the first 32 bytes of the u/v vector
+              times 2 db 0 ; padding to 32 bytes
+              times 5 dw 4,0,2 ; per-word shift counts for the second 32 bytes (zmm build only)
+              times 2 db 0 ; padding to 32 bytes
+
+v210enc_10_permd_y:  dd 0,1,2,-1 , 3,4,5,-1 ; vpermd indices for y, non-ICL path
+v210enc_10_shufb_y:  db -1,0,1,-1 , 2,3,4,5 , -1,6,7,-1 , 8,9,10,11 ; pshufb control for y, broadcast to every 128-bit lane
+v210enc_10_permd_uv: dd 0,1,4,5 , 1,2,5,6 ; vpermd indices for the combined u/v vector, non-ICL path
+v210enc_10_shufb_uv: db 0,1, 8, 9 , -1,2,3,-1 , 10,11,4,5 , -1,12,13,-1 ; pshufb control for u/v, low 128-bit lane
+                     db 2,3,10,11 , -1,4,5,-1 , 12,13,6,7 , -1,14,15,-1 ; pshufb control for u/v, high 128-bit lane
+
 SECTION .text
 
 %macro v210_planar_pack_10 0
@@ -113,6 +143,75 @@ INIT_YMM avx2
 v210_planar_pack_10
 %endif
 
+%macro v210_planar_pack_10_new 0
+; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width)
+cglobal v210_planar_pack_10, 5, 5, 8+2*notcpuflag(avx512icl), y, u, v, dst, width
+    lea     yq, [yq+2*widthq] ; yq = y + 2*width; the loop indexes it with a negative offset
+    add     uq, widthq ; uq = u + width
+    add     vq, widthq ; vq = v + width
+    neg     widthq ; widthq now counts up towards zero
+
+    %if cpuflag(avx512icl)
+        movu  m6, [icl_perm_y] ; vpermb indices for y
+        movu  m7, [icl_perm_uv] ; vpermb indices for u/v
+        kmovq k1, [icl_perm_y_kmask] ; zeroing mask paired with m6
+        kmovq k2, [icl_perm_uv_kmask] ; zeroing mask paired with m7
+    %else
+        movu           m6, [v210enc_10_permd_y] ; vpermd indices for y
+        VBROADCASTI128 m7, [v210enc_10_shufb_y] ; same pshufb control in every 128-bit lane
+        movu           m8, [v210enc_10_permd_uv] ; vpermd indices for u/v
+        movu           m9, [v210enc_10_shufb_uv] ; per-lane pshufb control for u/v
+    %endif
+    movu  m2, [icl_shift_y] ; per-word shift counts for y (shared by both paths)
+    movu  m3, [icl_shift_uv] ; per-word shift counts for u/v (shared by both paths)
+    VBROADCASTI128 m4, [v210_enc_min_10] ; only ymm sized
+    VBROADCASTI128 m5, [v210_enc_max_10] ; only ymm sized
+
+    .loop:
+        movu m0, [yq + widthq*2] ; NOTE(review): loads mmsize bytes while widthq*2 advances by only mmsize*3/4 per iteration - confirm the buffers tolerate the extra read
+        %if cpuflag(avx512icl)
+            movu         ym1, [uq + widthq*1] ; u goes into the low 256 bits
+            vinserti32x8 zm1, [vq + widthq*1], 1 ; v goes into the high 256 bits
+        %else
+            movu         xm1, [uq + widthq*1] ; u goes into the low 128 bits
+            vinserti128  ym1, [vq + widthq*1], 1 ; v goes into the high 128 bits
+        %endif
+        CLIPW m0, m4, m5 ; CLIPW is defined elsewhere; m4/m5 hold the min/max constants loaded above
+        CLIPW m1, m4, m5
+
+        vpsllvw m0, m2 ; shift each y word by its own count
+        vpsllvw m1, m3 ; shift each u/v word by its own count
+        %if cpuflag(avx512icl)
+            vpermb  m0{k1}{z}, m6, m0 ; byte permute; bytes whose k1 bit is clear become zero
+            vpermb  m1{k2}{z}, m7, m1 ; byte permute; bytes whose k2 bit is clear become zero
+        %else
+            vpermd m0, m6, m0 ; move the needed dwords into each 128-bit lane
+            pshufb m0, m7 ; then place the bytes within each lane
+            vpermd m1, m8, m1
+            pshufb m1, m9
+        %endif
+        por     m0, m1 ; merge the y and u/v bytes into the output vector
+
+        movu  [dstq], m0
+        add     dstq, mmsize ; one full vector of output per iteration
+        add   widthq, (mmsize*3)/8 ; widthq step per iteration (24 with zmm, 12 with ymm)
+    jl .loop ; continue while widthq is still negative
+RET
+
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX512_EXTERNAL
+INIT_YMM avx512 ; ymm build: takes the non-ICL path, which uses m8 and m9
+v210_planar_pack_10_new
+%endif
+%endif
+
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl ; zmm build: takes the vpermb path and requests only 8 vector registers
+v210_planar_pack_10_new
+%endif
+
 %macro v210_planar_pack_8 0
 
 ; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width)
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index 6e9f8c6e61..5d1ebcb893 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -37,6 +37,12 @@ void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
 void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u,
                                  const uint16_t *v, uint8_t *dst,
                                  ptrdiff_t width);
+void ff_v210_planar_pack_10_avx512(const uint16_t *y, const uint16_t *u,
+                                   const uint16_t *v, uint8_t *dst,
+                                   ptrdiff_t width); /* asm built only when ARCH_X86_64 and HAVE_AVX512_EXTERNAL */
+void ff_v210_planar_pack_10_avx512icl(const uint16_t *y, const uint16_t *u,
+                                      const uint16_t *v, uint8_t *dst,
+                                      ptrdiff_t width); /* asm built when HAVE_AVX512ICL_EXTERNAL */
 
 av_cold void ff_v210enc_init_x86(V210EncContext *s)
 {
@@ -60,10 +66,16 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
     if (EXTERNAL_AVX512(cpu_flags)) {
         s->sample_factor_8  = 2;
         s->pack_line_8      = ff_v210_planar_pack_8_avx512;
+#if ARCH_X86_64 /* tested by value, matching the %if ARCH_X86_64 guard around the asm function */
+        s->sample_factor_10 = 2;
+        s->pack_line_10     = ff_v210_planar_pack_10_avx512;
+#endif
     }
 
     if (EXTERNAL_AVX512ICL(cpu_flags)) {
         s->sample_factor_8  = 4;
         s->pack_line_8      = ff_v210_planar_pack_8_avx512icl;
+        s->sample_factor_10 = 4;
+        s->pack_line_10     = ff_v210_planar_pack_10_avx512icl;
     }
 }
-- 
2.38.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  parent reply	other threads:[~2022-11-21 12:47 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-21 12:44 [FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
2022-11-21 12:44 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86/v210: replace register use with named register James Darnley
2022-11-21 12:44 ` James Darnley [this message]
2022-11-21 14:12   ` [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl Andreas Rheinhardt
2022-11-21 16:50     ` James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 1/5] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 2/5] avcodec/x86/v210enc: replace register use with named register James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 3/5] avcodec/v210enc: add new 10-bit function for avx512 avx512icl James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 4/5] avcodec/x86/v210enc: expand and correct comments James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 5/5] avcodec/x86/v210enc: remove unneeded instruction James Darnley

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221121124408.1577897-3-jdarnley@obe.tv \
    --to=jdarnley@obe.tv \
    --cc=ffmpeg-devel@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git