On 2/18/2025 11:58 AM, Shreesh Adiga wrote:
> On Mon, Feb 3, 2025 at 10:03 PM Shreesh Adiga
> <16567adigashreesh@gmail.com> wrote:
>>
>> The scalar loop is replaced with masked AVX512 instructions.
>> For extracting the Y from UYVY, vperm2b is used instead of
>> various AND and packuswb.
>>
>> Instead of loading the vectors with interleaved lanes as done
>> in AVX2 version, normal load is used. At the end of packuswb,
>> for U and V, an extra permute operation is done to get the
>> required layout.
>>
>> AMD 7950x Zen 4 benchmark data:
>> uyvytoyuv422_c:         29105.0 ( 1.00x)
>> uyvytoyuv422_sse2:       3888.0 ( 7.49x)
>> uyvytoyuv422_avx:        3374.2 ( 8.63x)
>> uyvytoyuv422_avx2:       2649.8 (10.98x)
>> uyvytoyuv422_avx512icl:  1615.0 (18.02x)
>>
>> Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
>> ---
>>  libswscale/x86/rgb2rgb.c     |   6 ++
>>  libswscale/x86/rgb_2_rgb.asm | 105 +++++++++++++++++++++++++++++++++++
>>  2 files changed, 111 insertions(+)
>>
>> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>> index 4cbed54b35..6601dad233 100644
>> --- a/libswscale/x86/rgb2rgb.c
>> +++ b/libswscale/x86/rgb2rgb.c
>> @@ -2383,6 +2383,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>>  void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>>                            const uint8_t *src, int width, int height,
>>                            int lumStride, int chromStride, int srcStride);
>> +void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>> +                               const uint8_t *src, int width, int height,
>> +                               int lumStride, int chromStride, int srcStride);
>>  #endif
>>
>>  #define DEINTERLEAVE_BYTES(cpuext) \
>> @@ -2477,6 +2480,9 @@ av_cold void rgb2rgb_init_x86(void)
>>      }
>>      if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>>          uyvytoyuv422 = ff_uyvytoyuv422_avx2;
>> +    }
>> +    if (EXTERNAL_AVX512ICL(cpu_flags)) {
>> +        uyvytoyuv422 = ff_uyvytoyuv422_avx512icl;
>>  #endif
>>      }
>>  #endif
>> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>> index ca7a481255..6e4df17298 100644
>> --- a/libswscale/x86/rgb_2_rgb.asm
>> +++ b/libswscale/x86/rgb_2_rgb.asm
>> @@ -35,6 +35,20 @@ pb_shuffle2013: db 2, 0, 1, 3, 6, 4, 5, 7, 10, 8, 9, 11, 14, 12, 13, 15
>>  pb_shuffle2130: db 2, 1, 3, 0, 6, 5, 7, 4, 10, 9, 11, 8, 14, 13, 15, 12
>>  pb_shuffle1203: db 1, 2, 0, 3, 5, 6, 4, 7, 9, 10, 8, 11, 13, 14, 12, 15
>>
>> +%if HAVE_AVX512ICL_EXTERNAL
>> +; shuffle vector to rearrange packuswb result to be linear
>> +shuf_packus: db 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51,\
>> +                4, 5, 6, 7, 20, 21, 22, 23, 36, 37, 38, 39, 52, 53, 54, 55,\
>> +                8, 9, 10, 11, 24, 25, 26, 27, 40, 41, 42, 43, 56, 57, 58, 59,\
>> +                12, 13, 14, 15, 28, 29, 30, 31, 44, 45, 46, 47, 60, 61, 62, 63
>> +
>> +; shuffle vector to combine odd elements from two vectors to extract Y
>> +shuf_perm2b: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,\
>> +                33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,\
>> +                65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,\
>> +                97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
>> +%endif
>> +
>>  SECTION .text
>>
>>  %macro RSHIFT_COPY 5
>> @@ -156,9 +170,20 @@ SHUFFLE_BYTES 1, 2, 0, 3
>>  ;              int lumStride, int chromStride, int srcStride)
>>  ;-----------------------------------------------------------------------------------------------
>>  %macro UYVY_TO_YUV422 0
>> +%if mmsize == 64
>> +; need two more registers to store shuffle vectors for AVX512ICL
>> +cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
>> +%else
>>  cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
>> +%endif
>>      pxor         m0, m0
>> +%if mmsize == 64
>> +    vpternlogd   m1, m1, m1, 0xff    ; m1 = _mm512_set1_epi8(0xff)
>> +    movu         m8, [shuf_packus]
>> +    movu         m9, [shuf_perm2b]
>> +%else
>>      pcmpeqw      m1, m1
>> +%endif
>>      psrlw        m1, 8
>>
>>      movsxdifnidn wq, wd
>> @@ -188,6 +213,63 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>>      and          xq, mmsize * 2 - 1
>>      je .loop_simd
>>
>> +%if mmsize == 64
>> +    shr          xq, 1
>> +    mov          tmpq, -1
>> +    shlx         tmpq, tmpq, xq
>> +    not          tmpq
>> +    kmovq        k7, tmpq    ; write mask for U/V
>> +    kmovd        k1, tmpd    ; write mask for 1st half of Y
>> +    kmovw        k3, tmpd    ; read mask for 1st vector
>> +    shr          tmpq, 16
>> +    kmovw        k4, tmpd    ; read mask for 2nd vector
>> +    shr          tmpq, 16
>> +    kmovd        k2, tmpd    ; write mask for 2nd half of Y
>> +    kmovw        k5, tmpd    ; read mask for 3rd vector
>> +    shr          tmpd, 16
>> +    kmovw        k6, tmpd    ; read mask for 4th vector
>> +
>> +    vmovdqu32    m2{k3}{z}, [srcq + wtwoq             ]
>> +    vmovdqu32    m3{k4}{z}, [srcq + wtwoq + mmsize    ]
>> +    vmovdqu32    m4{k5}{z}, [srcq + wtwoq + mmsize * 2]
>> +    vmovdqu32    m5{k6}{z}, [srcq + wtwoq + mmsize * 3]
>> +
>> +    ; extract y part 1
>> +    mova         m6, m9
>> +    vpermi2b     m6, m2, m3              ; UYVY UYVY -> YYYY using permute
>> +    vmovdqu16    [ydstq + wq]{k1}, m6
>> +
>> +    ; extract y part 2
>> +    mova         m7, m9
>> +    vpermi2b     m7, m4, m5              ; UYVY UYVY -> YYYY using permute
>> +    vmovdqu16    [ydstq + wq + mmsize]{k2}, m7
>> +
>> +    ; extract uv
>> +    pand         m2, m1   ; UxVx...
>> +    pand         m3, m1   ; UxVx...
>> +    pand         m4, m1   ; UxVx...
>> +    pand         m5, m1   ; UxVx...
>> +    packuswb     m2, m3   ; UVUV...
>> +    packuswb     m4, m5   ; UVUV...
>> +
>> +    ; U
>> +    pand         m6, m2, m1 ; UxUx...
>> +    pand         m7, m4, m1 ; UxUx...
>> +    packuswb     m6, m7     ; UUUU
>> +    vpermb       m6, m8, m6
>> +    vmovdqu8     [udstq + whalfq]{k7}, m6
>> +
>> +    ; V
>> +    psrlw        m2, 8      ; VxVx...
>> +    psrlw        m4, 8      ; VxVx...
>> +    packuswb     m2, m4     ; VVVV
>> +    vpermb       m2, m8, m2
>> +    vmovdqu8     [vdstq + whalfq]{k7}, m2
>> +
>> +    lea          wq, [wq + 2 * xq]
>> +    lea          wtwoq, [wtwoq + 4 * xq]
>> +    add          whalfq, xq
>> +%else
>>  .loop_scalar:
>>      mov          tmpb, [srcq + wtwoq + 0]
>>      mov          [udstq + whalfq], tmpb
>> @@ -206,6 +288,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>>      add          whalfq, 1
>>      sub          xq, 2
>>      jg .loop_scalar
>> +%endif
>>
>>      ; check if simd loop is need
>>      cmp          wq, 0
>> @@ -228,6 +311,17 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>>      movu         m5, [srcq + wtwoq + mmsize * 3]
>>  %endif
>>
>> +%if mmsize == 64
>> +    ; extract y part 1
>> +    mova         m6, m9
>> +    vpermi2b     m6, m2, m3              ; UYVY UYVY -> YYYY using permute
>> +    movu         [ydstq + wq], m6
>> +
>> +    ; extract y part 2
>> +    mova         m7, m9
>> +    vpermi2b     m7, m4, m5              ; UYVY UYVY -> YYYY using permute
>> +    movu         [ydstq + wq + mmsize], m7
>> +%else
>>      ; extract y part 1
>>      RSHIFT_COPY  m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
>>      pand         m6, m1; YxYx YxYx...
>> @@ -247,6 +341,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>>
>>      packuswb     m6, m7 ; YYYY YYYY...
>>      movu         [ydstq + wq + mmsize], m6
>> +%endif
>>
>>      ; extract uv
>>      pand         m2, m1   ; UxVx...
>> @@ -262,6 +357,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>>      pand         m7, m4, m1 ; UxUx...
>>
>>      packuswb     m6, m7     ; UUUU
>> +%if mmsize == 64
>> +    vpermb       m6, m8, m6
>> +%endif
>>      movu         [udstq + whalfq], m6
>>
>>
>> @@ -269,6 +367,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>>      psrlw        m2, 8      ; VxVx...
>>      psrlw        m4, 8      ; VxVx...
>>      packuswb     m2, m4     ; VVVV
>> +%if mmsize == 64
>> +    vpermb       m2, m8, m2
>> +%endif
>>      movu         [vdstq + whalfq], m2
>>
>>      add          whalfq, mmsize
>> @@ -303,4 +404,8 @@ UYVY_TO_YUV422
>>  INIT_YMM avx2
>>  UYVY_TO_YUV422
>>  %endif
>> +%if HAVE_AVX512ICL_EXTERNAL
>> +INIT_ZMM avx512icl
>> +UYVY_TO_YUV422
>> +%endif
>>  %endif
>> --
>> 2.45.3
>>
>
> Hi maintainers,
>
> Would anyone be willing to review this and provide inputs on getting
> this accepted?
> As a new contributor interested in contributing ASM, I was hoping to
> work on https://trac.ffmpeg.org/wiki/SmallASMTasks mentioned AVX512ICL
> work.

Applied, sorry for the delay.
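
For readers who want to map the commit message onto concrete operations, below is a rough standalone C intrinsics sketch of the Y-extraction idea described above: Y samples sit at the odd byte offsets of UYVY, so a single two-source byte permute (vpermi2b, i.e. _mm512_permutex2var_epi8) gathers 64 Y bytes from 128 input bytes, and k-masked loads/stores stand in for a scalar tail loop. This is not part of the patch or of FFmpeg: the function name, parameter names and mask arithmetic are assumptions made for illustration, and the real code additionally produces the U and V planes.

/* Requires AVX512BW + AVX512VBMI, e.g. gcc -O2 -mavx512bw -mavx512vbmi */
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

void uyvy_extract_y_avx512icl(uint8_t *ydst, const uint8_t *src, size_t n_pixels)
{
    /* Index table: odd bytes of source a (1, 3, ..., 63), then odd bytes of
     * source b (65, 67, ..., 127) -- same layout as the patch's shuf_perm2b. */
    uint8_t idx[64];
    for (int i = 0; i < 64; i++)
        idx[i] = (uint8_t)(2 * i + 1);
    const __m512i perm = _mm512_loadu_si512(idx);

    size_t i = 0;
    /* Main loop: 128 UYVY bytes in, 64 Y bytes out per iteration. */
    for (; i + 64 <= n_pixels; i += 64) {
        __m512i a = _mm512_loadu_si512(src + 2 * i);
        __m512i b = _mm512_loadu_si512(src + 2 * i + 64);
        _mm512_storeu_si512(ydst + i, _mm512_permutex2var_epi8(a, perm, b));
    }

    /* Tail: masked loads/stores instead of a scalar loop (rem < 64 here). */
    if (i < n_pixels) {
        size_t rem = n_pixels - i;                     /* remaining Y samples  */
        __mmask64 rd_lo = rem * 2 >= 64 ? ~(__mmask64)0
                                        : (((__mmask64)1 << (rem * 2)) - 1);
        __mmask64 rd_hi = rem * 2 <= 64 ? 0
                                        : (((__mmask64)1 << (rem * 2 - 64)) - 1);
        __mmask64 wr    = ((__mmask64)1 << rem) - 1;   /* one bit per Y byte   */
        __m512i a = _mm512_maskz_loadu_epi8(rd_lo, src + 2 * i);
        __m512i b = _mm512_maskz_loadu_epi8(rd_hi, src + 2 * i + 64);
        _mm512_mask_storeu_epi8(ydst + i, wr, _mm512_permutex2var_epi8(a, perm, b));
    }
}

The single two-source permute is essentially what the commit message means by using vperm2b "instead of various AND and packuswb": the older SSE2/AVX2 paths need a shift, mask and pack sequence per vector to isolate the Y bytes, whereas here one instruction does the gather across two input vectors.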