From: James Darnley <jdarnley@obe.tv> To: ffmpeg-devel@ffmpeg.org Subject: [FFmpeg-devel] [RFC PATCH 2/2] avcodec/x86: add avx512icl function for v210dec Date: Thu, 15 Dec 2022 11:49:04 +0100 Message-ID: <20221215104904.3264109-2-jdarnley@obe.tv> (raw) In-Reply-To: <20221215104904.3264109-1-jdarnley@obe.tv> Ice Lake (Xeon Silver 4316): 2.01x faster (1147±36.8 vs. 571±38.2 decicycles) compared with avx2 --- I think I can merge this with the existing macro without it being too ugly. That might allow a plain avx512 version too but I can't say if that would be any faster. libavcodec/x86/v210-init.c | 10 ++++++- libavcodec/x86/v210.asm | 60 +++++++++++++++++++++++++++++++++++++- tests/checkasm/v210dec.c | 12 ++++---- 3 files changed, 74 insertions(+), 8 deletions(-) diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c index 5db1fef98c..8b3677b8aa 100644 --- a/libavcodec/x86/v210-init.c +++ b/libavcodec/x86/v210-init.c @@ -17,7 +17,7 @@ */ #include "libavutil/attributes.h" -#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" #include "libavcodec/v210dec.h" extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); @@ -28,6 +28,8 @@ extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_avx512icl(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); + av_cold void ff_v210_x86_init(V210DecContext *s) { #if HAVE_X86ASM @@ -42,6 +44,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) s->unpack_frame = ff_v210_planar_unpack_aligned_avx2; + + if (EXTERNAL_AVX512ICL(cpu_flags)) + s->unpack_frame = 
ff_v210_planar_unpack_avx512icl; } else { if (cpu_flags & AV_CPU_FLAG_SSSE3) @@ -52,6 +57,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2; + + if (EXTERNAL_AVX512ICL(cpu_flags)) + s->unpack_frame = ff_v210_planar_unpack_avx512icl; } #endif } diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm index 600a4ddc5f..f247737ed0 100644 --- a/libavcodec/x86/v210.asm +++ b/libavcodec/x86/v210.asm @@ -22,7 +22,21 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 + +perm_y: + db 0,1, 4,5, 6,7, 8,9, 12,13, 14,15, 16,17, 20,21 + db 22,23, 24,25, 28,29, 30,31, 32,33, 36,37, 38,39, 40,41 + db 44,45, 46,47, 48,49, 52,53, 54,55, 56,57, 60,61, 62,63 +times 16 db 0xff ; align to 64 + +perm_uv: + db 0,1, 4,5, 10,11, 16,17, 20,21, 26,27, 32,33, 36,37 + db 42,43, 48,49, 52,53, 58,59 +times 8 db 0xff ; align to 32 + db 2,3, 8,9, 12,13, 18,19, 24,25, 28,29, 34,35, 40,41 + db 44,45, 50,51, 56,57, 60,61 +times 8 db 0xff ; align to 32 ; for AVX2 version only v210_luma_permute: dd 0,1,2,4,5,6,7,7 ; 32-byte alignment required @@ -34,6 +48,9 @@ v210_mult: dw 64,4,64,4,64,4,64,4 v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 +shift: times 4 dw 6, 2 +kmask: dw 0x5555, 0xaaaa + SECTION .text %macro v210_planar_unpack 1 @@ -127,3 +144,44 @@ v210_planar_unpack aligned INIT_YMM avx2 v210_planar_unpack aligned %endif + +%if HAVE_AVX512ICL_EXTERNAL + +INIT_ZMM avx512icl + +cglobal v210_planar_unpack, 5, 5, 6, src, y, u, v, w + movsxdifnidn wq, wd + lea yq, [yq+2*wq] + add uq, wq + add vq, wq + neg wq + + kmovw k1, [kmask] ; odd dword mask + kmovw k2, [kmask+2] ; even dword mask + + VBROADCASTI128 m0, [shift] + mova m1, [perm_y] + mova m2, [perm_uv] + + .loop: + movu m3, [srcq] + vpsllvw m4, m3, m0 + pslld m5, m3, 12 + psrlw m4, 6 + psrld m5, 22 + + vpblendmd m3{k1}, m4, m5 + vpermb 
m3, m1, m3 ; could use vpcompressw + movu [yq+2*wq], m3 + + vpblendmd m5{k2}, m4, m5 + vpermb m5, m2, m5 + movu [uq+wq], ym5 + vextracti32x8 [vq+wq], zm5, 1 + + add srcq, mmsize + add wq, (mmsize*3)/8 + jl .loop +RET + +%endif diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c index 6aef519cc5..93993bae71 100644 --- a/tests/checkasm/v210dec.c +++ b/tests/checkasm/v210dec.c @@ -54,12 +54,12 @@ void checkasm_check_v210dec(void) if (check_func(h.unpack_frame, "v210_unpack")) { uint32_t src0[NUM_SAMPLES/3]; uint32_t src1[NUM_SAMPLES/3]; - uint16_t y0[NUM_SAMPLES/2]; - uint16_t y1[NUM_SAMPLES/2]; - uint16_t u0[NUM_SAMPLES/4]; - uint16_t u1[NUM_SAMPLES/4]; - uint16_t v0[NUM_SAMPLES/4]; - uint16_t v1[NUM_SAMPLES/4]; + uint16_t y0[NUM_SAMPLES/2 + 15]; + uint16_t y1[NUM_SAMPLES/2 + 15]; + uint16_t u0[NUM_SAMPLES/4 + 7]; + uint16_t u1[NUM_SAMPLES/4 + 7]; + uint16_t v0[NUM_SAMPLES/4 + 7]; + uint16_t v1[NUM_SAMPLES/4 + 7]; declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); const int pixels = NUM_SAMPLES / 2 / 6 * 6; -- 2.38.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
prev parent reply other threads:[~2022-12-15 10:51 UTC|newest] Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top 2022-12-15 10:49 [FFmpeg-devel] [PATCH 1/2] avcodec/x86/v210: add some comments to the improved avx2 function James Darnley 2022-12-15 10:49 ` James Darnley [this message]
Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221215104904.3264109-2-jdarnley@obe.tv \
    --to=jdarnley@obe.tv \
    --cc=ffmpeg-devel@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git