* [FFmpeg-devel] [PATCH 2/3] checkasm: add test for yadif
2023-02-10 13:06 [FFmpeg-devel] [PATCH 1/3] avfilter: move yadif's filter_line init into a dedicated function James Darnley
@ 2023-02-10 13:06 ` James Darnley
2023-02-10 13:06 ` [FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function James Darnley
1 sibling, 0 replies; 5+ messages in thread
From: James Darnley @ 2023-02-10 13:06 UTC (permalink / raw)
To: ffmpeg-devel
---
tests/checkasm/Makefile | 1 +
tests/checkasm/checkasm.c | 3 ++
tests/checkasm/checkasm.h | 1 +
tests/checkasm/vf_yadif.c | 62 +++++++++++++++++++++++++++++++++++++++
4 files changed, 67 insertions(+)
create mode 100644 tests/checkasm/vf_yadif.c
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index a6f06c7007..fc65bdc77d 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -47,6 +47,7 @@ AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o
AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER) += vf_threshold.o
AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER) += vf_nlmeans.o
AVFILTEROBJS-$(CONFIG_SOBEL_FILTER) += vf_convolution.o
+AVFILTEROBJS-$(CONFIG_YADIF_FILTER) += vf_yadif.o
CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index e96d84a7da..2bb72cf839 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -200,6 +200,9 @@ static const struct {
#if CONFIG_SOBEL_FILTER
{ "vf_sobel", checkasm_check_vf_sobel },
#endif
+ #if CONFIG_YADIF_FILTER
+ { "vf_yadif", checkasm_check_vf_yadif },
+ #endif
#endif
#if CONFIG_SWSCALE
{ "sw_gbrp", checkasm_check_sw_gbrp },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 8744a81218..0b9a83b5b5 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -87,6 +87,7 @@ void checkasm_check_vf_gblur(void);
void checkasm_check_vf_hflip(void);
void checkasm_check_vf_threshold(void);
void checkasm_check_vf_sobel(void);
+void checkasm_check_vf_yadif(void);
void checkasm_check_vp8dsp(void);
void checkasm_check_vp9dsp(void);
void checkasm_check_videodsp(void);
diff --git a/tests/checkasm/vf_yadif.c b/tests/checkasm/vf_yadif.c
new file mode 100644
index 0000000000..cb58519c23
--- /dev/null
+++ b/tests/checkasm/vf_yadif.c
@@ -0,0 +1,62 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/internal.h"
+#include "libavfilter/yadif.h"
+
+#define WIDTH 256
+/* Store the same value, rnd() & mask, into buf0[i] and buf1[i] for every i in [0, count). */
+#define randomize_buffers(buf0, buf1, mask, count) \
+ for (size_t i = 0; i < (count); i++) /* i must start at 0: an uninitialized index is undefined behaviour */ \
+ (buf0)[i] = (buf1)[i] = rnd() & (mask)
+/* checkasm entry point for yadif: runs the 8-bit filter_line through call_ref and call_new on identical input and fails if the results differ. */
+void checkasm_check_vf_yadif(void)
+{
+ YADIFContext ctx_8, ctx_10, ctx_16;
+
+ ff_yadif_init_filter_line(&ctx_8, 8);
+ ff_yadif_init_filter_line(&ctx_10, 10);
+ ff_yadif_init_filter_line(&ctx_16, 16);
+ /* NOTE(review): only ctx_8.filter_line is exercised below; ctx_10 and ctx_16 are initialized but never tested. */
+ if (check_func(ctx_8.filter_line, "yadif8")) {
+ uint8_t prev0[5*WIDTH + STRIDE_ALIGN], prev1[5*WIDTH + STRIDE_ALIGN];
+ uint8_t next0[5*WIDTH + STRIDE_ALIGN], next1[5*WIDTH + STRIDE_ALIGN];
+ uint8_t cur0[5*WIDTH + STRIDE_ALIGN], cur1[5*WIDTH + STRIDE_ALIGN];
+ uint8_t dst0[WIDTH + STRIDE_ALIGN], dst1[WIDTH + STRIDE_ALIGN];
+
+ declare_func(void, void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int parity, int mode);
+ /* Each *0/*1 pair receives the same values, so both calls below see identical input. */
+ randomize_buffers(prev0, prev1, 0xff, 5*WIDTH + STRIDE_ALIGN);
+ randomize_buffers(next0, next1, 0xff, 5*WIDTH + STRIDE_ALIGN);
+ randomize_buffers(cur0, cur1, 0xff, 5*WIDTH + STRIDE_ALIGN);
+ /* Same arguments for both calls: w = prefs = mrefs = WIDTH, parity = 0, mode = 1. */
+ call_ref(dst0, prev0, cur0, next0, WIDTH, WIDTH, WIDTH, 0, 1);
+ call_new(dst1, prev1, cur1, next1, WIDTH, WIDTH, WIDTH, 0, 1);
+ /* Fail if the first WIDTH output bytes differ, or if the two copies of any input buffer no longer match. */
+ if (memcmp(dst0, dst1, WIDTH)
+ || memcmp(prev0, prev1, sizeof prev0)
+ || memcmp(next0, next1, sizeof next0)
+ || memcmp(cur0, cur1, sizeof cur0))
+ fail();
+ bench_new(dst1, prev1, cur1, next1, WIDTH, WIDTH, WIDTH, 0, 1);
+ }
+ report("yadif8");
+}
--
2.39.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function
2023-02-10 13:06 [FFmpeg-devel] [PATCH 1/3] avfilter: move yadif's filter_line init into a dedicated function James Darnley
2023-02-10 13:06 ` [FFmpeg-devel] [PATCH 2/3] checkasm: add test for yadif James Darnley
@ 2023-02-10 13:06 ` James Darnley
2023-02-20 12:55 ` James Darnley
1 sibling, 1 reply; 5+ messages in thread
From: James Darnley @ 2023-02-10 13:06 UTC (permalink / raw)
To: ffmpeg-devel
Zen 2 (Ryzen 7 3700X):
1.73x faster (3603±586.3 vs. 2082±317.1 decicycles) compared with ssse3
Using an SD y4m file, speed increases from ~3600 fps to ~4700 fps.
---
libavfilter/x86/vf_yadif.asm | 83 +++++++++++++++++++++++----------
libavfilter/x86/vf_yadif_init.c | 4 ++
2 files changed, 62 insertions(+), 25 deletions(-)
diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm
index 809cebdd3f..571febfca3 100644
--- a/libavfilter/x86/vf_yadif.asm
+++ b/libavfilter/x86/vf_yadif.asm
@@ -25,11 +25,30 @@
SECTION_RODATA
-pb_1: times 16 db 1
-pw_1: times 8 dw 1
+pb_1: times 32 db 1
+pw_1: times 16 dw 1
SECTION .text
+%unmacro RSHIFT 2
+
+%macro RSHIFT 2 ; %1 = vector register, %2 = number of bytes to shift right
+%if mmsize == 32
+ vextracti128 xm7, %1, 1 ; xm7 = upper 128 bits of %1 (clobbers m7)
+ palignr xmm %+ %1, xm7, xmm %+ %1, %2 ; shift by %2, not a fixed 2: callers pass both 1 and 2; only the low 128 bits of the result are meaningful
+%else
+ psrldq %1, %2
+%endif
+%endmacro
+
+%macro UNPACK 1 ; widen the low bytes of register %1 to words, in place
+%if mmsize == 32
+ pmovzxbw %1, xmm %+ %1 ; zero-extend the bytes of the low 128 bits of %1 into words
+%else
+ punpcklbw %1, m7 ; interleave with m7; NOTE(review): relies on m7 being zero at this point - confirm at every call site
+%endif
+%endmacro
+
%macro CHECK 2
movu m2, [curq+t1+%1]
movu m3, [curq+t0+%2]
@@ -40,7 +59,7 @@ SECTION .text
pand m4, [pb_1]
psubusb m5, m4
RSHIFT m5, 1
- punpcklbw m5, m7
+ UNPACK m5
mova m4, m2
psubusb m2, m3
psubusb m3, m4
@@ -49,9 +68,9 @@ SECTION .text
mova m4, m2
RSHIFT m3, 1
RSHIFT m4, 2
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
+ UNPACK m2
+ UNPACK m3
+ UNPACK m4
paddw m2, m3
paddw m2, m4
%endmacro
@@ -81,13 +100,19 @@ SECTION .text
%endmacro
%macro LOAD 2
- movh %1, %2
- punpcklbw %1, m7
+ %if mmsize == 32
+ pmovzxbw %1, %2
+ %else
+ movh %1, %2
+ punpcklbw %1, m7
+ %endif
%endmacro
%macro FILTER 3
.loop%1:
- pxor m7, m7
+ %if mmsize != 32
+ pxor m7, m7
+ %endif
LOAD m0, [curq+t1]
LOAD m1, [curq+t0]
LOAD m2, [%2]
@@ -95,9 +120,9 @@ SECTION .text
mova m4, m3
paddw m3, m2
psraw m3, 1
- mova [rsp+ 0], m0
- mova [rsp+16], m3
- mova [rsp+32], m1
+ mova [rsp+0*mmsize], m0
+ mova [rsp+1*mmsize], m3
+ mova [rsp+2*mmsize], m1
psubw m2, m4
ABS1 m2, m4
LOAD m3, [prevq+t1]
@@ -119,7 +144,7 @@ SECTION .text
paddw m3, m4
psrlw m3, 1
pmaxsw m2, m3
- mova [rsp+48], m2
+ mova [rsp+3*mmsize], m2
paddw m1, m0
paddw m0, m0
@@ -134,9 +159,9 @@ SECTION .text
psubusb m3, m4
pmaxub m2, m3
mova m3, m2
- psrldq m3, 2
- punpcklbw m2, m7
- punpcklbw m3, m7
+ RSHIFT m3, 2
+ UNPACK m2
+ UNPACK m3
paddw m0, m2
paddw m0, m3
psubw m0, [pw_1]
@@ -150,7 +175,7 @@ SECTION .text
CHECK 1, -3
CHECK2
- mova m6, [rsp+48]
+ mova m6, [rsp+3*mmsize]
cmp DWORD r8m, 2
jge .end%1
LOAD m2, [%2+t1*2]
@@ -161,9 +186,9 @@ SECTION .text
paddw m3, m5
psrlw m2, 1
psrlw m3, 1
- mova m4, [rsp+ 0]
- mova m5, [rsp+16]
- mova m7, [rsp+32]
+ mova m4, [rsp+0*mmsize]
+ mova m5, [rsp+1*mmsize]
+ mova m7, [rsp+2*mmsize]
psubw m2, m4
psubw m3, m7
mova m0, m5
@@ -182,15 +207,21 @@ SECTION .text
pmaxsw m6, m4
.end%1:
- mova m2, [rsp+16]
+ mova m2, [rsp+1*mmsize]
mova m3, m2
psubw m2, m6
paddw m3, m6
pmaxsw m1, m2
pminsw m1, m3
- packuswb m1, m1
- movh [dstq], m1
+ %if mmsize == 32
+ vextracti128 xm4, ym1, 1
+ packuswb xm1, xm4
+ movu [dstq], xm1
+ %else
+ packuswb m1, m1
+ movh [dstq], m1
+ %endif
add dstq, mmsize/2
add prevq, mmsize/2
add curq, mmsize/2
@@ -201,10 +232,10 @@ SECTION .text
%macro YADIF 0
%if ARCH_X86_32
-cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
+cglobal yadif_filter_line, 4, 6, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \
mrefs, parity, mode
%else
-cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
+cglobal yadif_filter_line, 4, 7, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \
mrefs, parity, mode
%endif
%if ARCH_X86_32
@@ -233,3 +264,5 @@ INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
+INIT_YMM avx2
+YADIF
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index d648f0f835..48858dc295 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -29,6 +29,8 @@ void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,
void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
int mrefs, int parity, int mode);
+void ff_yadif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int parity, int mode);
void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
@@ -68,5 +70,7 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif, int bit_depth)
yadif->filter_line = ff_yadif_filter_line_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_ssse3;
+ if (EXTERNAL_AVX2(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_avx2;
}
}
--
2.39.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread