From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> To: ffmpeg-devel@ffmpeg.org Subject: Re: [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl Date: Mon, 21 Nov 2022 15:12:38 +0100 Message-ID: <AS8P250MB0744F6277FC308CAE95107438F0A9@AS8P250MB0744.EURP250.PROD.OUTLOOK.COM> (raw) In-Reply-To: <20221121124408.1577897-3-jdarnley@obe.tv> James Darnley: > avx512 on Skylake-X (Xeon D-2123IT): > 1.19x faster (970±91.2 vs. 817±104.4 decicycles) compared with avx2 > > avx512icl on Ice Lake (Xeon Silver 4316): > 2.52x faster (1350±5.3 vs. 535±9.5 decicycles) compared with avx2 > --- > libavcodec/x86/v210enc.asm | 99 +++++++++++++++++++++++++++++++++++ > libavcodec/x86/v210enc_init.c | 12 +++++ > 2 files changed, 111 insertions(+) > > diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm > index c2ad3d72c0..9cee954619 100644 > --- a/libavcodec/x86/v210enc.asm > +++ b/libavcodec/x86/v210enc.asm > @@ -56,6 +56,36 @@ v210enc_8_permd: dd 0,1,4,5, 1,2,5,6 > v210enc_8_mult: db 4, 0, 64, 0 > v210enc_8_mask: dd 255<<12 > > +icl_perm_y: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb > +%assign i 0 > +%rep 8 > + db -1,i+0,i+1,-1 , i+2,i+3,i+4,i+5 > + %assign i i+6 > +%endrep > + > +icl_perm_uv: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb > +%assign i 0 > +%rep 4 > + db i+0,i+1,i+32,i+33 , -1,i+2,i+3,-1 , i+34,i+35,i+4,i+5 , -1,i+36,i+37,-1 > + %assign i i+6 > +%endrep > + > +icl_perm_y_kmask: times 8 db 0b1111_0110 > +icl_perm_uv_kmask: times 8 db 0b0110_1111 > + > +icl_shift_y: times 10 dw 2,0,4 > + times 4 db 0 ; padding to 64 bytes > +icl_shift_uv: times 5 dw 0,2,4 > + times 2 db 0 ; padding to 32 bytes > + times 5 dw 4,0,2 > + times 2 db 0 ; padding to 32 bytes > + > +v210enc_10_permd_y: dd 0,1,2,-1 , 3,4,5,-1 > +v210enc_10_shufb_y: db -1,0,1,-1 , 2,3,4,5 , -1,6,7,-1 , 8,9,10,11 > +v210enc_10_permd_uv: dd 0,1,4,5 , 1,2,5,6 > +v210enc_10_shufb_uv: db 0,1, 8, 9 , -1,2,3,-1 , 10,11,4,5 , -1,12,13,-1 > + db 2,3,10,11 , -1,4,5,-1 , 12,13,6,7 , -1,14,15,-1 > + > SECTION .text > > %macro v210_planar_pack_10 0 > @@ -113,6 +143,75 @@ INIT_YMM avx2 > v210_planar_pack_10 > %endif > > +%macro v210_planar_pack_10_new 0 > + > +cglobal v210_planar_pack_10, 5, 5, 8+2*notcpuflag(avx512icl), y, u, v, dst, width > + lea yq, [yq+2*widthq] > + add uq, widthq > + add vq, widthq > + neg widthq > + > + %if cpuflag(avx512icl) > + movu m6, [icl_perm_y] > + movu m7, [icl_perm_uv] > + kmovq k1, [icl_perm_y_kmask] > + kmovq k2, [icl_perm_uv_kmask] > + %else > + movu m6, [v210enc_10_permd_y] > + VBROADCASTI128 m7, [v210enc_10_shufb_y] > + movu m8, [v210enc_10_permd_uv] > + movu m9, [v210enc_10_shufb_uv] > + %endif > + movu m2, [icl_shift_y] > + movu m3, [icl_shift_uv] > + VBROADCASTI128 m4, [v210_enc_min_10] ; only ymm sized > + VBROADCASTI128 m5, [v210_enc_max_10] ; only ymm sized > + > + .loop: > + movu m0, [yq + widthq*2] > + %if cpuflag(avx512icl) > + movu ym1, [uq + widthq*1] > + vinserti32x8 zm1, [vq + widthq*1], 1 > + %else > + movu xm1, [uq + widthq*1] > + vinserti128 ym1, [vq + widthq*1], 1 > + %endif > + CLIPW m0, m4, m5 > + CLIPW m1, m4, m5 > + > + vpsllvw m0, m2 > + vpsllvw m1, m3 > + %if cpuflag(avx512icl) > + vpermb m0{k1}{z}, m6, m0 > + vpermb m1{k2}{z}, m7, m1 > + %else > + vpermd m0, m6, m0 > + pshufb m0, m7 > + vpermd m1, m8, m1 > + pshufb m1, m9 > + %endif > + por m0, m1 > + > + movu [dstq], m0 > + add dstq, mmsize > + add widthq, (mmsize*3)/8 > + jl .loop > +RET > + > +%endmacro > + > +%if ARCH_X86_64 > +%if HAVE_AVX512_EXTERNAL > +INIT_YMM avx512 > +v210_planar_pack_10_new > +%endif > +%endif > + > +%if HAVE_AVX512ICL_EXTERNAL > +INIT_ZMM avx512icl > +v210_planar_pack_10_new > +%endif > + > %macro v210_planar_pack_8 0 > > ; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width) > diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c > index 6e9f8c6e61..5d1ebcb893 100644 > --- a/libavcodec/x86/v210enc_init.c > +++ b/libavcodec/x86/v210enc_init.c > @@ -37,6 +37,12 @@ void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u, > void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u, > const uint16_t *v, uint8_t *dst, > ptrdiff_t width); > +void ff_v210_planar_pack_10_avx512(const uint16_t *y, const uint16_t *u, > + const uint16_t *v, uint8_t *dst, > + ptrdiff_t width); > +void ff_v210_planar_pack_10_avx512icl(const uint16_t *y, const uint16_t *u, > + const uint16_t *v, uint8_t *dst, > + ptrdiff_t width); > > av_cold void ff_v210enc_init_x86(V210EncContext *s) > { > @@ -60,10 +66,16 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s) > if (EXTERNAL_AVX512(cpu_flags)) { > s->sample_factor_8 = 2; > s->pack_line_8 = ff_v210_planar_pack_8_avx512; > +#ifdef ARCH_X86_64 ARCH_X86_64 is always defined. So checks of this type need to check with #if. > + s->sample_factor_10 = 2; > + s->pack_line_10 = ff_v210_planar_pack_10_avx512; > +#endif > } > > if (EXTERNAL_AVX512ICL(cpu_flags)) { > s->sample_factor_8 = 4; > s->pack_line_8 = ff_v210_planar_pack_8_avx512icl; > + s->sample_factor_10 = 4; > + s->pack_line_10 = ff_v210_planar_pack_10_avx512icl; > } > } _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2022-11-21 14:12 UTC|newest] Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top 2022-11-21 12:44 [FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley 2022-11-21 12:44 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86/v210: replace register use with named register James Darnley 2022-11-21 12:44 ` [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl James Darnley 2022-11-21 14:12 ` Andreas Rheinhardt [this message] 2022-11-21 16:50 ` James Darnley 2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 1/5] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley 2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 2/5] avcodec/x86/v210enc: replace register use with named register James Darnley 2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 3/5] avcodec/v210enc: add new 10-bit function for avx512 avx512icl James Darnley 2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 4/5] avcodec/x86/v210enc: expand and correct comments James Darnley 2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 5/5] avcodec/x86/v210enc: remove unneeded instruction James Darnley
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=AS8P250MB0744F6277FC308CAE95107438F0A9@AS8P250MB0744.EURP250.PROD.OUTLOOK.COM \ --to=andreas.rheinhardt@outlook.com \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git