* [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: optimize AVX2 version of uyvytoyuv422
@ 2025-02-20 13:21 Shreesh Adiga
0 siblings, 0 replies; only message in thread
From: Shreesh Adiga @ 2025-02-20 13:21 UTC (permalink / raw)
To: ffmpeg-devel
Currently, the SIMD loop of the AVX2 version of uyvytoyuv422 does the following:
4 vinserti128 to interleave the vector lanes during the load from memory.
4 vperm2i128 inside 4 RSHIFT_COPY calls to achieve the desired layout.
This patch replaces the above 8 instructions with 2 vpermq and
2 vpermd using a shuffle vector register, similar to the AVX512ICL version.
Observed the following numbers on various microarchitectures:
On AMD Zen3 laptop:
Before:
uyvytoyuv422_c: 51979.7 ( 1.00x)
uyvytoyuv422_sse2: 5410.5 ( 9.61x)
uyvytoyuv422_avx: 4642.7 (11.20x)
uyvytoyuv422_avx2: 4249.0 (12.23x)
After:
uyvytoyuv422_c: 51659.8 ( 1.00x)
uyvytoyuv422_sse2: 5420.8 ( 9.53x)
uyvytoyuv422_avx: 4651.2 (11.11x)
uyvytoyuv422_avx2: 3953.8 (13.07x)
On Intel Macbook Pro 2019:
Before:
uyvytoyuv422_c: 185014.4 ( 1.00x)
uyvytoyuv422_sse2: 22800.4 ( 8.11x)
uyvytoyuv422_avx: 19796.9 ( 9.35x)
uyvytoyuv422_avx2: 13141.9 (14.08x)
After:
uyvytoyuv422_c: 185093.4 ( 1.00x)
uyvytoyuv422_sse2: 22795.4 ( 8.12x)
uyvytoyuv422_avx: 19791.9 ( 9.35x)
uyvytoyuv422_avx2: 12043.1 (15.37x)
On AMD Zen4 desktop:
Before:
uyvytoyuv422_c: 29105.0 ( 1.00x)
uyvytoyuv422_sse2: 3888.0 ( 7.49x)
uyvytoyuv422_avx: 3374.2 ( 8.63x)
uyvytoyuv422_avx2: 2649.8 (10.98x)
uyvytoyuv422_avx512icl: 1615.0 (18.02x)
After:
uyvytoyuv422_c: 29093.4 ( 1.00x)
uyvytoyuv422_sse2: 3874.4 ( 7.51x)
uyvytoyuv422_avx: 3371.6 ( 8.63x)
uyvytoyuv422_avx2: 2174.6 (13.38x)
uyvytoyuv422_avx512icl: 1625.1 (17.90x)
Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
---
libswscale/x86/rgb_2_rgb.asm | 68 ++++++++++++++++++------------------
1 file changed, 34 insertions(+), 34 deletions(-)
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 6e4df17298..871bb21127 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -49,18 +49,21 @@ shuf_perm2b: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25,
97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
%endif
+%if HAVE_AVX2_EXTERNAL
+; shuffle vector to rearrange packuswb result to be linear
+shuf_packus_avx2: db 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0,\
+ 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0,
+%endif
+
SECTION .text
-%macro RSHIFT_COPY 5
+%macro RSHIFT_COPY 3
; %1 dst ; %2 src ; %3 shift
-%if mmsize == 32
- vperm2i128 %1, %2, %3, %5
- RSHIFT %1, %4
-%elif cpuflag(avx)
- psrldq %1, %2, %4
+%if cpuflag(avx) || cpuflag(avx2) || cpuflag(avx512icl)
+ psrldq %1, %2, %3
%else
mova %1, %2
- RSHIFT %1, %4
+ RSHIFT %1, %3
%endif
%endmacro
@@ -170,18 +173,16 @@ SHUFFLE_BYTES 1, 2, 0, 3
; int lumStride, int chromStride, int srcStride)
;-----------------------------------------------------------------------------------------------
%macro UYVY_TO_YUV422 0
-%if mmsize == 64
-; need two more registers to store shuffle vectors for AVX512ICL
-cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
-%else
-cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
-%endif
+cglobal uyvytoyuv422, 9, 14, 8 + cpuflag(avx2) + cpuflag(avx512icl), ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
pxor m0, m0
%if mmsize == 64
vpternlogd m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff)
movu m8, [shuf_packus]
movu m9, [shuf_perm2b]
%else
+ %if cpuflag(avx2)
+ movu m8, [shuf_packus_avx2]
+ %endif
pcmpeqw m1, m1
%endif
psrlw m1, 8
@@ -295,21 +296,10 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
jge .end_line
.loop_simd:
-%if mmsize == 32
- movu xm2, [srcq + wtwoq ]
- movu xm3, [srcq + wtwoq + 16 ]
- movu xm4, [srcq + wtwoq + 16 * 2]
- movu xm5, [srcq + wtwoq + 16 * 3]
- vinserti128 m2, m2, [srcq + wtwoq + 16 * 4], 1
- vinserti128 m3, m3, [srcq + wtwoq + 16 * 5], 1
- vinserti128 m4, m4, [srcq + wtwoq + 16 * 6], 1
- vinserti128 m5, m5, [srcq + wtwoq + 16 * 7], 1
-%else
movu m2, [srcq + wtwoq ]
movu m3, [srcq + wtwoq + mmsize ]
movu m4, [srcq + wtwoq + mmsize * 2]
movu m5, [srcq + wtwoq + mmsize * 3]
-%endif
%if mmsize == 64
; extract y part 1
@@ -323,23 +313,29 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
movu [ydstq + wq + mmsize], m7
%else
; extract y part 1
- RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
- pand m6, m1; YxYx YxYx...
+ RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
+ pand m6, m1 ; YxYx YxYx...
- RSHIFT_COPY m7, m3, m5, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
- pand m7, m1 ; YxYx YxYx...
+ RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
+ pand m7, m1 ; YxYx YxYx...
- packuswb m6, m7 ; YYYY YYYY...
+ packuswb m6, m7 ; YYYY YYYY...
+%if mmsize == 32
+ vpermq m6, m6, 0xd8
+%endif
movu [ydstq + wq], m6
; extract y part 2
- RSHIFT_COPY m6, m4, m2, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
- pand m6, m1; YxYx YxYx...
+ RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
+ pand m6, m1 ; YxYx YxYx...
- RSHIFT_COPY m7, m5, m3, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
- pand m7, m1 ; YxYx YxYx...
+ RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
+ pand m7, m1 ; YxYx YxYx...
- packuswb m6, m7 ; YYYY YYYY...
+ packuswb m6, m7 ; YYYY YYYY...
+%if mmsize == 32
+ vpermq m6, m6, 0xd8
+%endif
movu [ydstq + wq + mmsize], m6
%endif
@@ -359,6 +355,8 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
packuswb m6, m7 ; UUUU
%if mmsize == 64
vpermb m6, m8, m6
+%elif mmsize == 32
+ vpermd m6, m8, m6
%endif
movu [udstq + whalfq], m6
@@ -369,6 +367,8 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
packuswb m2, m4 ; VVVV
%if mmsize == 64
vpermb m2, m8, m2
+%elif mmsize == 32
+ vpermd m2, m8, m2
%endif
movu [vdstq + whalfq], m2
--
2.45.3
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-02-20 13:22 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-02-20 13:21 [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: optimize AVX2 version of uyvytoyuv422 Shreesh Adiga
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git