From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTP id D05AA4B456 for ; Wed, 5 Jun 2024 20:29:26 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 15BE168D6DF; Wed, 5 Jun 2024 23:29:20 +0300 (EEST) Received: from mail-pf1-f178.google.com (mail-pf1-f178.google.com [209.85.210.178]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 4CA1868D44A for ; Wed, 5 Jun 2024 23:29:14 +0300 (EEST) Received: by mail-pf1-f178.google.com with SMTP id d2e1a72fcca58-70249faa853so155135b3a.3 for ; Wed, 05 Jun 2024 13:29:14 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1717619351; x=1718224151; darn=ffmpeg.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:to:from:from:to:cc:subject:date:message-id :reply-to; bh=P0lzJOk91F7ggDylIlcAozW5VSVZjok2hpme3e8CLpk=; b=W17YWjwAmUl0BJTAAZCSrsjSRmio/bMZcOYBmrGFtYNbYNFVHIs55cZy4Zk6ls5T2u SeRGVIbiAPlSM6Av/NMvdHkEOSl4CV4sXhKCtlyWifkBFGfrDdLheXmbix+22B6vpFcK KQecMxSeFiLmcUZlga8s1PMDpqIw+az1svR9lscifm+PGjCnA5URjG/dkrwN9gE1SQtT kZZZTMXWs0IfWWlbHFRpUWltTqmnWEQX8ckaDBEZY8+QDTd/GoRQ60lLk8CMbK65krBS RBrymBYvBNZ7Y7ayuM3GA4a9N5y0Zs90BLdLwKUSpHRvWLlLtafnSwAGDVuB+PGNxfAA Uw5A== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1717619351; x=1718224151; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=P0lzJOk91F7ggDylIlcAozW5VSVZjok2hpme3e8CLpk=; b=F1HRNzMHy+9R1Wj1cXlTCkYK9VOVg6nIeODVBOi6QM7Ru1FWHVKRPRXL3osDxWEVVM 3z2IcaZllAUlnekuesOvQhexvFsQxdN4qaLw/kR5SSovZUC8ZufSbzcyMDa7UKeTeRMY yHpU3nXEpUHsZr4djMFl7L1JssN388QOaWgA+DJOVF/wlnk50Zb+3Y3fCug5lGz2LFLC lrVyIKEtPLpf0GRzaukwKoxBTMsIa4PIAPvZtgc7epP0o496yoZUP2QXQCIjBp1YhJbW VkFfiykjJFM9bCzXnoc+c8t5yU5+SqnRT80yFKe45tS1aER/snv8dmsIiGx1WwKNcTab k2pw== X-Gm-Message-State: AOJu0YzXxRlRqfSnU9Qz6iXl32dPhFt9W+UgVDAx/RJZqMbplBwXPr8o 8AjQDs1vEWzvHGYoOivP/zflPveqKmI5dPHdHugI+pK9h+o8fOkme64sDA== X-Google-Smtp-Source: AGHT+IGJRUBh+a3oAkqiYdq3aWNNQYSH71noYLN0sGk4bIPMxUaey+0MtMnJXzL3KJQhBTQAqRBU2w== X-Received: by 2002:a05:6a20:918e:b0:1af:cbd3:ab4c with SMTP id adf61e73a8af0-1b2b7025e81mr4665052637.35.1717619351341; Wed, 05 Jun 2024 13:29:11 -0700 (PDT) Received: from localhost.localdomain ([190.194.167.233]) by smtp.gmail.com with ESMTPSA id d2e1a72fcca58-7024967ae05sm8536492b3a.157.2024.06.05.13.29.10 for (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 05 Jun 2024 13:29:10 -0700 (PDT) From: James Almer To: ffmpeg-devel@ffmpeg.org Date: Wed, 5 Jun 2024 17:28:53 -0300 Message-ID: <20240605202853.3135-2-jamrial@gmail.com> X-Mailer: git-send-email 2.45.1 In-Reply-To: <20240605202853.3135-1-jamrial@gmail.com> References: <20240605202853.3135-1-jamrial@gmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 2/2] swscale/x86/input: add AVX2 optimized uyvytoyuv422 X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: uyvytoyuv422_c: 23991.8 uyvytoyuv422_sse2: 2817.8 uyvytoyuv422_avx: 2819.3 uyvytoyuv422_avx2: 1972.3 Signed-off-by: James Almer --- libswscale/x86/rgb2rgb.c | 6 ++++++ libswscale/x86/rgb_2_rgb.asm | 32 ++++++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index b325e5dbd5..21ccfafe51 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -136,6 +136,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride); +void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); #endif av_cold void rgb2rgb_init_x86(void) @@ -177,5 +180,8 @@ av_cold void rgb2rgb_init_x86(void) if (EXTERNAL_AVX(cpu_flags)) { uyvytoyuv422 = ff_uyvytoyuv422_avx; } + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + uyvytoyuv422 = ff_uyvytoyuv422_avx2; + } #endif } diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm index 76ca1eec03..0bf1278718 100644 --- a/libswscale/x86/rgb_2_rgb.asm +++ b/libswscale/x86/rgb_2_rgb.asm @@ -34,13 +34,16 @@ pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 SECTION .text -%macro RSHIFT_COPY 3 +%macro RSHIFT_COPY 5 ; %1 dst ; %2 src ; %3 shift -%if cpuflag(avx) - psrldq %1, %2, %3 +%if mmsize == 32 + vperm2i128 %1, %2, %3, %5 + RSHIFT %1, %4 +%elif cpuflag(avx) + psrldq %1, %2, %4 %else mova %1, %2 - RSHIFT %1, %3 + RSHIFT %1, %4 %endif %endmacro @@ -233,26 +236,37 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s jge .end_line .loop_simd: +%if mmsize == 32 + movu xm2, [srcq + wtwoq ] + movu xm3, [srcq + wtwoq + 16 ] + movu xm4, [srcq + wtwoq + 16 * 2] + movu xm5, [srcq + wtwoq + 16 * 3] + vinserti128 m2, m2, [srcq + wtwoq + 16 * 4], 1 + vinserti128 m3, m3, [srcq + wtwoq + 16 * 5], 1 + vinserti128 m4, m4, [srcq + wtwoq + 16 * 6], 1 + vinserti128 m5, m5, [srcq + wtwoq + 16 * 7], 1 +%else movu m2, [srcq + wtwoq ] movu m3, [srcq + wtwoq + mmsize ] movu m4, [srcq + wtwoq + mmsize * 2] movu m5, [srcq + wtwoq + mmsize * 3] +%endif ; extract y part 1 - RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY... pand m6, m1; YxYx YxYx... - RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m7, m3, m5, 1, 0x20 ; UYVY UYVY -> YVYU YVY... pand m7, m1 ; YxYx YxYx... packuswb m6, m7 ; YYYY YYYY... movu [ydstq + wq], m6 ; extract y part 2 - RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m6, m4, m2, 1, 0x13 ; UYVY UYVY -> YVYU YVY... pand m6, m1; YxYx YxYx... - RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m7, m5, m3, 1, 0x13 ; UYVY UYVY -> YVYU YVY... pand m7, m1 ; YxYx YxYx... packuswb m6, m7 ; YYYY YYYY... @@ -309,4 +323,6 @@ UYVY_TO_YUV422 INIT_XMM avx UYVY_TO_YUV422 +INIT_YMM avx2 +UYVY_TO_YUV422 %endif -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".