* [FFmpeg-devel] [PATCH 2/5] libswscale: Avx2 hscale can process inputs of any size.
@ 2022-02-09 9:13 Alan Kelly
0 siblings, 0 replies; only message in thread
From: Alan Kelly @ 2022-02-09 9:13 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Alan Kelly
The main loop processes blocks of 16 pixels. The tail loop then processes
any remaining pixels in blocks of 4.
---
libswscale/x86/scale_avx2.asm | 48 +++++++++++++++++++++++++++++++++--
1 file changed, 46 insertions(+), 2 deletions(-)
diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
index 20acdbd633..dc42abb100 100644
--- a/libswscale/x86/scale_avx2.asm
+++ b/libswscale/x86/scale_avx2.asm
@@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize,
mova m14, [four]
shr fltsized, 2
%endif
+ cmp wq, 16
+ jl .tail_loop
+ mov countq, 0x10
.loop:
movu m1, [fltposq]
movu m2, [fltposq+32]
@@ -97,11 +100,52 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize,
vpsrad m6, 7
vpackssdw m5, m5, m6
vpermd m5, m15, m5
- vmovdqu [dstq + countq * 2], m5
+ vmovdqu [dstq], m5
+ add dstq, 0x20
add fltposq, 0x40
add countq, 0x10
cmp countq, wq
- jl .loop
+ jle .loop
+
+ sub countq, 0x10
+ cmp countq, wq
+ jge .end
+
+.tail_loop:
+ movu xm1, [fltposq]
+%ifidn %1, X4
+ pxor xm9, xm9
+ pxor xm10, xm10
+ xor innerq, innerq
+.tail_innerloop:
+%endif
+ vpcmpeqd xm13, xm13
+ vpgatherdd xm3,[srcmemq + xm1], xm13
+ vpunpcklbw xm5, xm3, xm0
+ vpunpckhbw xm6, xm3, xm0
+ vpmaddwd xm5, xm5, [filterq]
+ vpmaddwd xm6, xm6, [filterq + 16]
+ add filterq, 0x20
+%ifidn %1, X4
+ paddd xm9, xm5
+ paddd xm10, xm6
+ paddd xm1, xm14
+ add innerq, 1
+ cmp innerq, fltsizeq
+ jl .tail_innerloop
+ vphaddd xm5, xm9, xm10
+%else
+ vphaddd xm5, xm5, xm6
+%endif
+ vpsrad xm5, 7
+ vpackssdw xm5, xm5, xm5
+ vmovq [dstq], xm5
+ add dstq, 0x8
+ add fltposq, 0x10
+ add countq, 0x4
+ cmp countq, wq
+ jl .tail_loop
+.end:
REP_RET
%endmacro
--
2.35.0.263.gb82422642f-goog
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2022-02-09 9:14 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-09 9:13 [FFmpeg-devel] [PATCH 2/5] libswscale: Avx2 hscale can process inputs of any size Alan Kelly
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git