From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTP id 0F65E40E95 for ; Wed, 9 Feb 2022 09:14:04 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id C32E668B19F; Wed, 9 Feb 2022 11:14:02 +0200 (EET) Received: from mail-wm1-f73.google.com (mail-wm1-f73.google.com [209.85.128.73]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 26FC168A888 for ; Wed, 9 Feb 2022 11:13:56 +0200 (EET) Received: by mail-wm1-f73.google.com with SMTP id r8-20020a7bc088000000b0037bbf779d26so226501wmh.7 for ; Wed, 09 Feb 2022 01:13:56 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=20210112; h=date:message-id:mime-version:subject:from:to:cc; bh=7RTLgeKUdGbFt2Vmrv/Q9Gw1Gqgkn2ZPPhlqZiOosec=; b=IQfxLWkIqK+jEnvMGT6fpRm79PUAytZZL3SQTgjL19qUJKzS0rR37kEzBcFxQkbWV7 PEkCkH13Dk1ZiGv12iVm0EXnsVlLWMwgpe+IFRlU5bXe3tsKIWkKm+zFbOqbZLQbQShR vXTNfFGPNuHVMCuVYu4ukmj6SXNmXODMZH0tjG8ZuLFGQn0/eGK3SzMBXUSUKMlcgQbK YnvvwaI1TCs4jjW9e2luU1ZrOux4ZbLg8SrldzqF0AD1OXCuIoQNaZkzlgX3I7fGfE3Y qJxYrBoqLDkIPGLXrhxnB0WZIEexVVUPaF+OrbpgvIOkvZCCTTSfjcZdOmUm16XjRhQ8 zh4w== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:date:message-id:mime-version:subject:from:to:cc; bh=7RTLgeKUdGbFt2Vmrv/Q9Gw1Gqgkn2ZPPhlqZiOosec=; b=VxJi+n8mS1GcHlL9+aqvMq2Gbo7nTuHyyU2n87Pn1sMvPfb0b9t55fOoAWehaYTLkf zH8gCXMNNxjJoOUcZhCc/4z+vlKnCYdBNXKY5HJ06fwJHFz6zGrgQaGtnX1hzx/N41Kb MneVhHDmM/hTYKtTERpg598trN6zM9AvgXF3XQ5qoWYjc6nPEGaBKwY1w/dy7OlKc35N 26o+ZAPL+2X9wjGDEaKW7Sbur8gUPG5l7KIBgtu/sAw4VCGLa4iy1frcmRLfUEdiZYYX cqj3GbfBs9W+++1cXe+leBgXhieki0MYgQBYcJHF23Djy0NDxpWVU5qq+33c08u6z1Sq SwGA== X-Gm-Message-State: AOAM530KdN5wDKT2Oh7FtRT7/B5H6+iIDNkhoT/YfjbX+cH1ZTIVBURc dMpazEd8xgabKJ578ASG9wtPn95sSiUiI0lYhTiH7RzjgZMNGZX1878zYdcdq5EkDuCesCRrGqu nL+njjnhYPVsF58f9xFLeF2QFKJnsHcsbq4ApVzKb7IJGQwlIkvfx0/FPwmD5n8fr8zEBEkg= X-Google-Smtp-Source: ABdhPJwkNxN8vB14u9v9+ORRJViXzdIzeuIMoToSEsdNQQ2+0WKL1cm0uOIBPvMVLu+yTIsZkn2GEivZwn9shqs= X-Received: from alankelly0.zrh.corp.google.com ([2a00:79e0:61:301:388b:9d0c:1bc4:40c]) (user=alankelly job=sendgmr) by 2002:a7b:c24a:: with SMTP id b10mr1164208wmj.191.1644398035435; Wed, 09 Feb 2022 01:13:55 -0800 (PST) Date: Wed, 9 Feb 2022 10:13:51 +0100 Message-Id: <20220209091351.3455295-1-alankelly@google.com> Mime-Version: 1.0 X-Mailer: git-send-email 2.35.0.263.gb82422642f-goog From: Alan Kelly To: ffmpeg-devel@ffmpeg.org Subject: [FFmpeg-devel] [PATCH 2/5] libswscale: Avx2 hscale can process inputs of any size. X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Alan Kelly Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: The main loop processes blocks of 16 pixels. The tail processes blocks of size 4. --- libswscale/x86/scale_avx2.asm | 48 +++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm index 20acdbd633..dc42abb100 100644 --- a/libswscale/x86/scale_avx2.asm +++ b/libswscale/x86/scale_avx2.asm @@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, mova m14, [four] shr fltsized, 2 %endif + cmp wq, 16 + jl .tail_loop + mov countq, 0x10 .loop: movu m1, [fltposq] movu m2, [fltposq+32] @@ -97,11 +100,52 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, vpsrad m6, 7 vpackssdw m5, m5, m6 vpermd m5, m15, m5 - vmovdqu [dstq + countq * 2], m5 + vmovdqu [dstq], m5 + add dstq, 0x20 add fltposq, 0x40 add countq, 0x10 cmp countq, wq - jl .loop + jle .loop + + sub countq, 0x10 + cmp countq, wq + jge .end + +.tail_loop: + movu xm1, [fltposq] +%ifidn %1, X4 + pxor xm9, xm9 + pxor xm10, xm10 + xor innerq, innerq +.tail_innerloop: +%endif + vpcmpeqd xm13, xm13 + vpgatherdd xm3,[srcmemq + xm1], xm13 + vpunpcklbw xm5, xm3, xm0 + vpunpckhbw xm6, xm3, xm0 + vpmaddwd xm5, xm5, [filterq] + vpmaddwd xm6, xm6, [filterq + 16] + add filterq, 0x20 +%ifidn %1, X4 + paddd xm9, xm5 + paddd xm10, xm6 + paddd xm1, xm14 + add innerq, 1 + cmp innerq, fltsizeq + jl .tail_innerloop + vphaddd xm5, xm9, xm10 +%else + vphaddd xm5, xm5, xm6 +%endif + vpsrad xm5, 7 + vpackssdw xm5, xm5, xm5 + vmovq [dstq], xm5 + add dstq, 0x8 + add fltposq, 0x10 + add countq, 0x4 + cmp countq, wq + jl .tail_loop +.end: REP_RET %endmacro -- 2.35.0.263.gb82422642f-goog _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".