Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/3] avfilter: move yadif's filter_line init into a dedicated function
@ 2023-02-10 13:06 James Darnley
  2023-02-10 13:06 ` [FFmpeg-devel] [PATCH 2/3] checkasm: add test for yadif James Darnley
  2023-02-10 13:06 ` [FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function James Darnley
  0 siblings, 2 replies; 5+ messages in thread
From: James Darnley @ 2023-02-10 13:06 UTC (permalink / raw)
  To: ffmpeg-devel

---
 libavfilter/vf_yadif.c          | 13 +++++++++----
 libavfilter/x86/vf_yadif_init.c |  4 +---
 libavfilter/yadif.h             |  3 ++-
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index afa4d1d53d..1f9434f961 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -303,7 +303,14 @@ static int config_output(AVFilterLink *outlink)
 
     s->csp = av_pix_fmt_desc_get(outlink->format);
     s->filter = filter;
-    if (s->csp->comp[0].depth > 8) {
+    ff_yadif_init_filter_line(s, s->csp->comp[0].depth);
+
+    return 0;
+}
+
+av_cold void ff_yadif_init_filter_line(YADIFContext *s, int bit_depth)
+{
+    if (bit_depth > 8) {
         s->filter_line  = filter_line_c_16bit;
         s->filter_edges = filter_edges_16bit;
     } else {
@@ -312,10 +319,8 @@ static int config_output(AVFilterLink *outlink)
     }
 
 #if ARCH_X86
-    ff_yadif_init_x86(s);
+    ff_yadif_init_x86(s, bit_depth);
 #endif
-
-    return 0;
 }
 
 
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 257c3f9199..d648f0f835 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -47,11 +47,9 @@ void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
                                       void *next, int w, int prefs,
                                       int mrefs, int parity, int mode);
 
-av_cold void ff_yadif_init_x86(YADIFContext *yadif)
+av_cold void ff_yadif_init_x86(YADIFContext *yadif, int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
-    int bit_depth = (!yadif->csp) ? 8
-                                  : yadif->csp->comp[0].depth;
 
     if (bit_depth >= 15) {
         if (EXTERNAL_SSE2(cpu_flags))
diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h
index c928911b35..5d8309b403 100644
--- a/libavfilter/yadif.h
+++ b/libavfilter/yadif.h
@@ -86,7 +86,8 @@ typedef struct YADIFContext {
     int current_field;  ///< YADIFCurrentField
 } YADIFContext;
 
-void ff_yadif_init_x86(YADIFContext *yadif);
+void ff_yadif_init_filter_line(YADIFContext *s, int bit_depth);
+void ff_yadif_init_x86(YADIFContext *yadif, int bit_depth);
 
 int ff_yadif_filter_frame(AVFilterLink *link, AVFrame *frame);
 
-- 
2.39.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 2/3] checkasm: add test for yadif
  2023-02-10 13:06 [FFmpeg-devel] [PATCH 1/3] avfilter: move yadif's filter_line init into a dedicated function James Darnley
@ 2023-02-10 13:06 ` James Darnley
  2023-02-10 13:06 ` [FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function James Darnley
  1 sibling, 0 replies; 5+ messages in thread
From: James Darnley @ 2023-02-10 13:06 UTC (permalink / raw)
  To: ffmpeg-devel

---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/vf_yadif.c | 62 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 67 insertions(+)
 create mode 100644 tests/checkasm/vf_yadif.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index a6f06c7007..fc65bdc77d 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -47,6 +47,7 @@ AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)      += vf_hflip.o
 AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER)  += vf_threshold.o
 AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER)    += vf_nlmeans.o
 AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
+AVFILTEROBJS-$(CONFIG_YADIF_FILTER)      += vf_yadif.o
 
 CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index e96d84a7da..2bb72cf839 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -200,6 +200,9 @@ static const struct {
     #if CONFIG_SOBEL_FILTER
         { "vf_sobel", checkasm_check_vf_sobel },
     #endif
+    #if CONFIG_YADIF_FILTER
+        { "vf_yadif", checkasm_check_vf_yadif },
+    #endif
 #endif
 #if CONFIG_SWSCALE
     { "sw_gbrp", checkasm_check_sw_gbrp },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 8744a81218..0b9a83b5b5 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -87,6 +87,7 @@ void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
 void checkasm_check_vf_sobel(void);
+void checkasm_check_vf_yadif(void);
 void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
diff --git a/tests/checkasm/vf_yadif.c b/tests/checkasm/vf_yadif.c
new file mode 100644
index 0000000000..cb58519c23
--- /dev/null
+++ b/tests/checkasm/vf_yadif.c
@@ -0,0 +1,62 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/internal.h"
+#include "libavfilter/yadif.h"
+
+#define WIDTH 256
+
+#define randomize_buffers(buf0, buf1, mask, count) /* fill both buffers with the same random data */ \
+    for (size_t i = 0; i < (count); i++) /* index must start at 0; it was previously read uninitialised */ \
+        (buf0)[i] = (buf1)[i] = rnd() & (mask)
+
+void checkasm_check_vf_yadif(void)
+{
+    YADIFContext ctx_8, ctx_10, ctx_16;
+    /* Only the 8-bit filter_line is exercised below; ctx_10/ctx_16 are initialised but not tested yet. */
+    ff_yadif_init_filter_line(&ctx_8, 8);
+    ff_yadif_init_filter_line(&ctx_10, 10);
+    ff_yadif_init_filter_line(&ctx_16, 16);
+
+    if (check_func(ctx_8.filter_line, "yadif8")) {
+        uint8_t prev0[5*WIDTH + STRIDE_ALIGN], prev1[5*WIDTH + STRIDE_ALIGN];
+        uint8_t next0[5*WIDTH + STRIDE_ALIGN], next1[5*WIDTH + STRIDE_ALIGN];
+        uint8_t cur0[5*WIDTH + STRIDE_ALIGN], cur1[5*WIDTH + STRIDE_ALIGN];
+        uint8_t dst0[WIDTH + STRIDE_ALIGN], dst1[WIDTH + STRIDE_ALIGN];
+        const int mid = 2 * WIDTH; /* middle row of the 5-row buffers: two readable rows above and below */
+        declare_func(void, void *dst, void *prev, void *cur, void *next,
+                int w, int prefs, int mrefs, int parity, int mode);
+
+        randomize_buffers(prev0, prev1, 0xff, 5*WIDTH + STRIDE_ALIGN);
+        randomize_buffers(next0, next1, 0xff, 5*WIDTH + STRIDE_ALIGN);
+        randomize_buffers(cur0, cur1, 0xff, 5*WIDTH + STRIDE_ALIGN);
+        /* Use opposite-sign strides (prefs > 0, mrefs < 0) as in real usage; equal strides hide bugs in the spatial checks. */
+        call_ref(dst0, prev0 + mid, cur0 + mid, next0 + mid, WIDTH, WIDTH, -WIDTH, 0, 1);
+        call_new(dst1, prev1 + mid, cur1 + mid, next1 + mid, WIDTH, WIDTH, -WIDTH, 0, 1);
+
+        if (memcmp(dst0, dst1, WIDTH)
+                || memcmp(prev0, prev1, sizeof prev0)
+                || memcmp(next0, next1, sizeof next0)
+                || memcmp(cur0, cur1, sizeof cur0))
+            fail();
+        bench_new(dst1, prev1 + mid, cur1 + mid, next1 + mid, WIDTH, WIDTH, -WIDTH, 0, 1);
+    }
+    report("yadif8");
+}
-- 
2.39.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function
  2023-02-10 13:06 [FFmpeg-devel] [PATCH 1/3] avfilter: move yadif's filter_line init into a dedicated function James Darnley
  2023-02-10 13:06 ` [FFmpeg-devel] [PATCH 2/3] checkasm: add test for yadif James Darnley
@ 2023-02-10 13:06 ` James Darnley
  2023-02-20 12:55   ` James Darnley
  1 sibling, 1 reply; 5+ messages in thread
From: James Darnley @ 2023-02-10 13:06 UTC (permalink / raw)
  To: ffmpeg-devel

Zen 2 (Ryzen 7 3700X):
1.73x faster (3603±586.3 vs. 2082±317.1 decicycles) compared with ssse3

Using an SD y4m file speed increases from ~ 3600 fps to ~4700.
---
 libavfilter/x86/vf_yadif.asm    | 83 +++++++++++++++++++++++----------
 libavfilter/x86/vf_yadif_init.c |  4 ++
 2 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm
index 809cebdd3f..571febfca3 100644
--- a/libavfilter/x86/vf_yadif.asm
+++ b/libavfilter/x86/vf_yadif.asm
@@ -25,11 +25,30 @@
 
 SECTION_RODATA
 
-pb_1: times 16 db 1
-pw_1: times  8 dw 1
+pb_1: times 32 db 1
+pw_1: times 16 dw 1
 
 SECTION .text
 
+%unmacro RSHIFT 2
+
+%macro RSHIFT 2
+%if mmsize == 32
+    vextracti128 xm7, %1, 1 ; xm7 = upper 16 bytes of the ymm register (clobbers m7)
+    palignr xmm %+ %1, xm7, xmm %+ %1, %2 ; low 16 bytes = (upper:lower) shifted right by the requested byte count, not a fixed 2
+%else
+    psrldq %1, %2
+%endif
+%endmacro
+
+%macro UNPACK 1
+%if mmsize == 32
+    pmovzxbw %1, xmm %+ %1 ; ymm path: zero-extend the low 16 bytes of the register to 16 words, in place
+%else
+    punpcklbw %1, m7 ; xmm path: interleave low bytes with m7 (presumably zero here, see the pxor at loop start)
+%endif
+%endmacro
+
 %macro CHECK 2
     movu      m2, [curq+t1+%1]
     movu      m3, [curq+t0+%2]
@@ -40,7 +59,7 @@ SECTION .text
     pand      m4, [pb_1]
     psubusb   m5, m4
     RSHIFT    m5, 1
-    punpcklbw m5, m7
+    UNPACK    m5
     mova      m4, m2
     psubusb   m2, m3
     psubusb   m3, m4
@@ -49,9 +68,9 @@ SECTION .text
     mova      m4, m2
     RSHIFT    m3, 1
     RSHIFT    m4, 2
-    punpcklbw m2, m7
-    punpcklbw m3, m7
-    punpcklbw m4, m7
+    UNPACK    m2
+    UNPACK    m3
+    UNPACK    m4
     paddw     m2, m3
     paddw     m2, m4
 %endmacro
@@ -81,13 +100,19 @@ SECTION .text
 %endmacro
 
 %macro LOAD 2
-    movh      %1, %2
-    punpcklbw %1, m7
+    %if mmsize == 32
+        pmovzxbw %1, %2
+    %else
+        movh      %1, %2
+        punpcklbw %1, m7
+    %endif
 %endmacro
 
 %macro FILTER 3
 .loop%1:
-    pxor         m7, m7
+    %if mmsize != 32
+        pxor         m7, m7
+    %endif
     LOAD         m0, [curq+t1]
     LOAD         m1, [curq+t0]
     LOAD         m2, [%2]
@@ -95,9 +120,9 @@ SECTION .text
     mova         m4, m3
     paddw        m3, m2
     psraw        m3, 1
-    mova   [rsp+ 0], m0
-    mova   [rsp+16], m3
-    mova   [rsp+32], m1
+    mova   [rsp+0*mmsize], m0
+    mova   [rsp+1*mmsize], m3
+    mova   [rsp+2*mmsize], m1
     psubw        m2, m4
     ABS1         m2, m4
     LOAD         m3, [prevq+t1]
@@ -119,7 +144,7 @@ SECTION .text
     paddw        m3, m4
     psrlw        m3, 1
     pmaxsw       m2, m3
-    mova   [rsp+48], m2
+    mova   [rsp+3*mmsize], m2
 
     paddw        m1, m0
     paddw        m0, m0
@@ -134,9 +159,9 @@ SECTION .text
     psubusb      m3, m4
     pmaxub       m2, m3
     mova         m3, m2
-    psrldq       m3, 2
-    punpcklbw    m2, m7
-    punpcklbw    m3, m7
+    RSHIFT       m3, 2
+    UNPACK       m2
+    UNPACK       m3
     paddw        m0, m2
     paddw        m0, m3
     psubw        m0, [pw_1]
@@ -150,7 +175,7 @@ SECTION .text
     CHECK 1, -3
     CHECK2
 
-    mova         m6, [rsp+48]
+    mova         m6, [rsp+3*mmsize]
     cmp   DWORD r8m, 2
     jge .end%1
     LOAD         m2, [%2+t1*2]
@@ -161,9 +186,9 @@ SECTION .text
     paddw        m3, m5
     psrlw        m2, 1
     psrlw        m3, 1
-    mova         m4, [rsp+ 0]
-    mova         m5, [rsp+16]
-    mova         m7, [rsp+32]
+    mova         m4, [rsp+0*mmsize]
+    mova         m5, [rsp+1*mmsize]
+    mova         m7, [rsp+2*mmsize]
     psubw        m2, m4
     psubw        m3, m7
     mova         m0, m5
@@ -182,15 +207,21 @@ SECTION .text
     pmaxsw       m6, m4
 
 .end%1:
-    mova         m2, [rsp+16]
+    mova         m2, [rsp+1*mmsize]
     mova         m3, m2
     psubw        m2, m6
     paddw        m3, m6
     pmaxsw       m1, m2
     pminsw       m1, m3
-    packuswb     m1, m1
 
-    movh     [dstq], m1
+    %if mmsize == 32
+        vextracti128 xm4, ym1, 1
+        packuswb xm1, xm4
+        movu [dstq], xm1
+    %else
+        packuswb     m1, m1
+        movh     [dstq], m1
+    %endif
     add        dstq, mmsize/2
     add       prevq, mmsize/2
     add        curq, mmsize/2
@@ -201,10 +232,10 @@ SECTION .text
 
 %macro YADIF 0
 %if ARCH_X86_32
-cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
+cglobal yadif_filter_line, 4, 6, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \
                                         mrefs, parity, mode
 %else
-cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
+cglobal yadif_filter_line, 4, 7, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \
                                         mrefs, parity, mode
 %endif
 %if ARCH_X86_32
@@ -233,3 +264,5 @@ INIT_XMM ssse3
 YADIF
 INIT_XMM sse2
 YADIF
+INIT_YMM avx2
+YADIF
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index d648f0f835..48858dc295 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -29,6 +29,8 @@ void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,
 void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
                                 void *next, int w, int prefs,
                                 int mrefs, int parity, int mode);
+void ff_yadif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
+        int w, int prefs, int mrefs, int parity, int mode);
 
 void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
                                      void *next, int w, int prefs,
@@ -68,5 +70,7 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif, int bit_depth)
             yadif->filter_line = ff_yadif_filter_line_sse2;
         if (EXTERNAL_SSSE3(cpu_flags))
             yadif->filter_line = ff_yadif_filter_line_ssse3;
+        if (EXTERNAL_AVX2(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_avx2;
     }
 }
-- 
2.39.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function
  2023-02-10 13:06 ` [FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function James Darnley
@ 2023-02-20 12:55   ` James Darnley
  2023-02-20 12:55     ` James Darnley
  0 siblings, 1 reply; 5+ messages in thread
From: James Darnley @ 2023-02-20 12:55 UTC (permalink / raw)
  To: ffmpeg-devel

On 2/10/23 14:06, James Darnley wrote:
> snip
This patch set is broken.  The checkasm test is incomplete.  This avx2 
function has a bug that only manifests when the strides (prefs and mrefs) 
have opposite signs (one positive and one negative).  That situation is 
what happens in real usage.  I fixed my checkasm test, which now shows it too.

Consider this patch set retracted until I can fix it.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function
  2023-02-20 12:55   ` James Darnley
@ 2023-02-20 12:55     ` James Darnley
  0 siblings, 0 replies; 5+ messages in thread
From: James Darnley @ 2023-02-20 12:55 UTC (permalink / raw)
  To: ffmpeg-devel

On 2/10/23 14:06, James Darnley wrote:
> snip
This patch set is broken.  The checkasm test is incomplete.  This avx2 
function has a bug that only manifests when the strides (prefs and mrefs) 
have opposite signs (one positive and one negative).  That situation is 
what happens in real usage.  I fixed my checkasm test, which now shows it too.

Consider this patch set retracted until I can fix it.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-02-20 12:57 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-10 13:06 [FFmpeg-devel] [PATCH 1/3] avfilter: move yadif's filter_line init into a dedicated function James Darnley
2023-02-10 13:06 ` [FFmpeg-devel] [PATCH 2/3] checkasm: add test for yadif James Darnley
2023-02-10 13:06 ` [FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function James Darnley
2023-02-20 12:55   ` James Darnley
2023-02-20 12:55     ` James Darnley

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git