From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTPS id 2C01E4BDC6 for ; Mon, 3 Feb 2025 16:33:55 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 9B3D768A9A7; Mon, 3 Feb 2025 18:33:51 +0200 (EET) Received: from mail-pj1-f51.google.com (mail-pj1-f51.google.com [209.85.216.51]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id B4882680069 for ; Mon, 3 Feb 2025 18:33:44 +0200 (EET) Received: by mail-pj1-f51.google.com with SMTP id 98e67ed59e1d1-2ee8e8e29f6so5871918a91.0 for ; Mon, 03 Feb 2025 08:33:44 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1738600422; x=1739205222; darn=ffmpeg.org; h=content-transfer-encoding:mime-version:message-id:date:subject:to :from:from:to:cc:subject:date:message-id:reply-to; bh=/3BhN+JVxHDXMa7OUJ7TxCAAaTXdRVycJXY6hXaKufI=; b=dMcq5m/FaGhAZeCQ2nB3RcLg6rYtGfw3TUt1MYTcjpGLgmyqsW9swH3+qChvDjM5hu 6x77EltlSwZJ3YAfpd9lUfy/SnQ5+VMYtR2ueJwY1uj/jUkyis19VvRCE4tVJQGTtNPU oitBbkdFHksCqwPMqCtS0D6umaBnOtK8mJlVUlt8FZQlQ9OYqXU5NlGoFwFzlO4gMjPo Y8qr2ArD7LTIhM1QgwxkymTCdNR3OUvrDMFPaij1ODV4Xo3vtfKxNnv+0SSGzycbXy9u QkGI2Pdi/IynOK90qOszsS3I4IQKIKukS6+sbyHIGA0ub+0MVC3umZJEdxv670q+nv+L ew8g== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1738600422; x=1739205222; h=content-transfer-encoding:mime-version:message-id:date:subject:to :from:x-gm-message-state:from:to:cc:subject:date:message-id:reply-to; bh=/3BhN+JVxHDXMa7OUJ7TxCAAaTXdRVycJXY6hXaKufI=; b=Q7CxYcoWr52dNAuOIYVpAQZzz9ljCb2EUiEGiy7oqYfE2q4F4SIjSkoHkK0t4+RNa5 r9oKK7eXlGQOQ3ERcL+t/pOvMBp3T+VlCLgMqpdSvYCQfU3dV82mGo86vZNg3ujwGxqN +R3D1kNwKL+0zhSBC+XWUD50qvpYsW5PH8c1u4Y+jJiy4T7Jw1uRIL6fgsJ8uqLd2S0Q HaFKTOkpfllBcL66mwrtoO6gMOn4y8mgw5jYX0OkKAypDYx5eY6XYvH5WPAyYaNkhFi4 jgngTL7X2c46XRzxYkShL8ceZ1vpxx+byl/w2Gi548ZNGePSSYPPTDkR3hpZm9BktZ9T 
8aqg== X-Gm-Message-State: AOJu0YyPzH98F0souKNQlJG4MELUTrWZZg686O7fHZaTls+ZNjTnR5Qs KIsHOPDk5rJ/k1M82Yj75Bpf/zPNulwX0SLk6MIZ1gTZiD+1wGkzVVvPuA== X-Gm-Gg: ASbGncuaoRJ4t67SFCPjkcpgSHZrkGn35rolux5XxO2Xx76zUmtecnPmayJeAtLal03 DqrK6OFXFzbb8f2Vdv2ljsDPRx4cS9t8JVwba4jPE7cTiV9Ksid8FDJLtlB5aMv91PXgwn3Q33o 8DpZdNTQl3wUT4nkSyv+YcBRMeJyngL5RJqZVeWJXbBvrMD10O4QkWyOHbrA8EALC34kZUfOdsN oxVLJeDh8HGeJUTAQ+NOX9gOqiYCvHnj7WA4lQnpMqhXYz64eiYFzD/zZ3TopeIe1Bopklqr9eC wfeDtQDx8KfbnBQepryktFPqC/J1Ij+CK5TUfQ== X-Google-Smtp-Source: AGHT+IHLM0e0hcGgFcNqwZdO6tjWY0YyA9WsEM+qLKefoEXOpcKMEn0x05XE5epM0t3PTZ+2WEQwnA== X-Received: by 2002:a05:6a00:a01:b0:725:b201:2353 with SMTP id d2e1a72fcca58-72fd0bffc63mr32972557b3a.13.1738600422106; Mon, 03 Feb 2025 08:33:42 -0800 (PST) Received: from localhost.localdomain ([106.51.30.120]) by smtp.gmail.com with ESMTPSA id d2e1a72fcca58-72fe69ba4acsm8511721b3a.116.2025.02.03.08.33.40 for (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 03 Feb 2025 08:33:41 -0800 (PST) From: Shreesh Adiga <16567adigashreesh@gmail.com> To: ffmpeg-devel@ffmpeg.org Date: Mon, 3 Feb 2025 22:03:30 +0530 Message-ID: <20250203163330.533628-1-16567adigashreesh@gmail.com> X-Mailer: git-send-email 2.45.3 MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add AVX512ICL version of uyvytoyuv422 X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: The scalar loop is replaced with masked AVX512 instructions. For extracting the Y from UYVY, vpermi2b is used instead of the sequence of AND and packuswb operations.
Instead of loading the vectors with interleaved lanes as done in AVX2 version, normal load is used. At the end of packuswb, for U and V, an extra permute operation is done to get the required layout. AMD 7950x Zen 4 benchmark data: uyvytoyuv422_c: 29105.0 ( 1.00x) uyvytoyuv422_sse2: 3888.0 ( 7.49x) uyvytoyuv422_avx: 3374.2 ( 8.63x) uyvytoyuv422_avx2: 2649.8 (10.98x) uyvytoyuv422_avx512icl: 1615.0 (18.02x) Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com> --- libswscale/x86/rgb2rgb.c | 6 ++ libswscale/x86/rgb_2_rgb.asm | 105 +++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 4cbed54b35..6601dad233 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -2383,6 +2383,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride); +void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); #endif #define DEINTERLEAVE_BYTES(cpuext) \ @@ -2477,6 +2480,9 @@ av_cold void rgb2rgb_init_x86(void) } if (EXTERNAL_AVX2_FAST(cpu_flags)) { uyvytoyuv422 = ff_uyvytoyuv422_avx2; + } + if (EXTERNAL_AVX512ICL(cpu_flags)) { + uyvytoyuv422 = ff_uyvytoyuv422_avx512icl; #endif } #endif diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm index ca7a481255..6e4df17298 100644 --- a/libswscale/x86/rgb_2_rgb.asm +++ b/libswscale/x86/rgb_2_rgb.asm @@ -35,6 +35,20 @@ pb_shuffle2013: db 2, 0, 1, 3, 6, 4, 5, 7, 10, 8, 9, 11, 14, 12, 13, 15 pb_shuffle2130: db 2, 1, 3, 0, 6, 5, 7, 4, 10, 9, 11, 8, 14, 13, 15, 12 pb_shuffle1203: db 1, 2, 0, 3, 5, 6, 4, 7, 9, 10, 8, 11, 13, 14, 12, 15 +%if HAVE_AVX512ICL_EXTERNAL +; shuffle vector to rearrange packuswb result to be linear +shuf_packus: db 
0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51,\ + 4, 5, 6, 7, 20, 21, 22, 23, 36, 37, 38, 39, 52, 53, 54, 55,\ + 8, 9, 10, 11, 24, 25, 26, 27, 40, 41, 42, 43, 56, 57, 58, 59,\ + 12, 13, 14, 15, 28, 29, 30, 31, 44, 45, 46, 47, 60, 61, 62, 63 + +; shuffle vector to combine odd elements from two vectors to extract Y +shuf_perm2b: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,\ + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,\ + 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,\ + 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127 +%endif + SECTION .text %macro RSHIFT_COPY 5 @@ -156,9 +170,20 @@ SHUFFLE_BYTES 1, 2, 0, 3 ; int lumStride, int chromStride, int srcStride) ;----------------------------------------------------------------------------------------------- %macro UYVY_TO_YUV422 0 +%if mmsize == 64 +; need two more registers to store shuffle vectors for AVX512ICL +cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w +%else cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w +%endif pxor m0, m0 +%if mmsize == 64 + vpternlogd m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff) + movu m8, [shuf_packus] + movu m9, [shuf_perm2b] +%else pcmpeqw m1, m1 +%endif psrlw m1, 8 movsxdifnidn wq, wd @@ -188,6 +213,63 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s and xq, mmsize * 2 - 1 je .loop_simd +%if mmsize == 64 + shr xq, 1 + mov tmpq, -1 + shlx tmpq, tmpq, xq + not tmpq + kmovq k7, tmpq ; write mask for U/V + kmovd k1, tmpd ; write mask for 1st half of Y + kmovw k3, tmpd ; read mask for 1st vector + shr tmpq, 16 + kmovw k4, tmpd ; read mask for 2nd vector + shr tmpq, 16 + kmovd k2, tmpd ; write mask for 2nd half of Y + kmovw k5, tmpd ; read mask for 3rd vector + shr tmpd, 16 + kmovw k6, tmpd ; read mask for 
4th vector + + vmovdqu32 m2{k3}{z}, [srcq + wtwoq ] + vmovdqu32 m3{k4}{z}, [srcq + wtwoq + mmsize ] + vmovdqu32 m4{k5}{z}, [srcq + wtwoq + mmsize * 2] + vmovdqu32 m5{k6}{z}, [srcq + wtwoq + mmsize * 3] + + ; extract y part 1 + mova m6, m9 + vpermi2b m6, m2, m3 ; UYVY UYVY -> YYYY using permute + vmovdqu16 [ydstq + wq]{k1}, m6 + + ; extract y part 2 + mova m7, m9 + vpermi2b m7, m4, m5 ; UYVY UYVY -> YYYY using permute + vmovdqu16 [ydstq + wq + mmsize]{k2}, m7 + + ; extract uv + pand m2, m1 ; UxVx... + pand m3, m1 ; UxVx... + pand m4, m1 ; UxVx... + pand m5, m1 ; UxVx... + packuswb m2, m3 ; UVUV... + packuswb m4, m5 ; UVUV... + + ; U + pand m6, m2, m1 ; UxUx... + pand m7, m4, m1 ; UxUx... + packuswb m6, m7 ; UUUU + vpermb m6, m8, m6 + vmovdqu8 [udstq + whalfq]{k7}, m6 + + ; V + psrlw m2, 8 ; VxVx... + psrlw m4, 8 ; VxVx... + packuswb m2, m4 ; VVVV + vpermb m2, m8, m2 + vmovdqu8 [vdstq + whalfq]{k7}, m2 + + lea wq, [ wq + 2 * xq] + lea wtwoq, [wtwoq + 4 * xq] + add whalfq, xq +%else .loop_scalar: mov tmpb, [srcq + wtwoq + 0] mov [udstq + whalfq], tmpb @@ -206,6 +288,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s add whalfq, 1 sub xq, 2 jg .loop_scalar +%endif ; check if simd loop is need cmp wq, 0 @@ -228,6 +311,17 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s movu m5, [srcq + wtwoq + mmsize * 3] %endif +%if mmsize == 64 + ; extract y part 1 + mova m6, m9 + vpermi2b m6, m2, m3 ; UYVY UYVY -> YYYY using permute + movu [ydstq + wq], m6 + + ; extract y part 2 + mova m7, m9 + vpermi2b m7, m4, m5 ; UYVY UYVY -> YYYY using permute + movu [ydstq + wq + mmsize], m7 +%else ; extract y part 1 RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY... pand m6, m1; YxYx YxYx... @@ -247,6 +341,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s packuswb m6, m7 ; YYYY YYYY... movu [ydstq + wq + mmsize], m6 +%endif ; extract uv pand m2, m1 ; UxVx... 
@@ -262,6 +357,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s pand m7, m4, m1 ; UxUx... packuswb m6, m7 ; UUUU +%if mmsize == 64 + vpermb m6, m8, m6 +%endif movu [udstq + whalfq], m6 @@ -269,6 +367,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s psrlw m2, 8 ; VxVx... psrlw m4, 8 ; VxVx... packuswb m2, m4 ; VVVV +%if mmsize == 64 + vpermb m2, m8, m2 +%endif movu [vdstq + whalfq], m2 add whalfq, mmsize @@ -303,4 +404,8 @@ UYVY_TO_YUV422 INIT_YMM avx2 UYVY_TO_YUV422 %endif +%if HAVE_AVX512ICL_EXTERNAL +INIT_ZMM avx512icl +UYVY_TO_YUV422 +%endif %endif -- 2.45.3 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".