Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH v2 1/5] avfilter/vf_yadif: Fix edge size when MAX_ALIGN is < 4
@ 2022-07-21  2:25 Chris Phlipot
  2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 2/5] avfilter/vf_yadif: Allow alignment to be configurable Chris Phlipot
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Chris Phlipot @ 2022-07-21  2:25 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Chris Phlipot

If alignment is set to less than 4 filter_edges will produce incorrect
output and not filter the entire edge. To fix this, make sure that
the edge size is at least 3.

Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
---
 libavfilter/vf_yadif.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index afa4d1d53d..055327d7a4 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -120,7 +120,7 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
     uint8_t *prev2 = parity ? prev : cur ;
     uint8_t *next2 = parity ? cur  : next;
 
-    const int edge = MAX_ALIGN - 1;
+    const int edge = FFMAX(MAX_ALIGN - 1, 3);
     int offset = FFMAX(w - edge, 3);
 
     /* Only edge pixels need to be processed here.  A constant value of false
@@ -169,7 +169,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
     uint16_t *prev2 = parity ? prev : cur ;
     uint16_t *next2 = parity ? cur  : next;
 
-    const int edge = MAX_ALIGN / 2 - 1;
+    const int edge = FFMAX(MAX_ALIGN / 2 - 1, 3);
     int offset = FFMAX(w - edge, 3);
 
     mrefs /= 2;
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH v2 2/5] avfilter/vf_yadif: Allow alignment to be configurable
  2022-07-21  2:25 [FFmpeg-devel] [PATCH v2 1/5] avfilter/vf_yadif: Fix edge size when MAX_ALIGN is < 4 Chris Phlipot
@ 2022-07-21  2:25 ` Chris Phlipot
  2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 3/5] avfilter/vf_yadif: reformat code to improve readability Chris Phlipot
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Chris Phlipot @ 2022-07-21  2:25 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Chris Phlipot

Allow the alignment to be determined based on what yadif_filter_line
implementation is used. Currently this is either 1, or 8 depending
on whether the C code or the x86 SSE code is used, but allows for
other future implementations that use a larger alignment.

Adjusting MAX_ALIGN to 32 in the case of an AVX2 implementation
could potentially hurt the performance of the SSE implementation,
so we allow yadif to use the smallest needed alignment instead to
maintain existing performance if implementations with wider vectors
are added.

Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
---
 libavfilter/vf_yadif.c          | 16 +++++++++-------
 libavfilter/x86/vf_yadif_init.c |  1 +
 libavfilter/yadif.h             |  4 +++-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index 055327d7a4..42f6246330 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -108,9 +108,9 @@ static void filter_line_c(void *dst1,
     FILTER(0, w, 1)
 }
 
-#define MAX_ALIGN 8
 static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
-                         int w, int prefs, int mrefs, int parity, int mode)
+                         int w, int prefs, int mrefs, int parity, int mode,
+                         int alignment)
 {
     uint8_t *dst  = dst1;
     uint8_t *prev = prev1;
@@ -120,7 +120,7 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
     uint8_t *prev2 = parity ? prev : cur ;
     uint8_t *next2 = parity ? cur  : next;
 
-    const int edge = FFMAX(MAX_ALIGN - 1, 3);
+    const int edge = FFMAX(alignment - 1, 3);
     int offset = FFMAX(w - edge, 3);
 
     /* Only edge pixels need to be processed here.  A constant value of false
@@ -159,7 +159,8 @@ static void filter_line_c_16bit(void *dst1,
 }
 
 static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
-                               int w, int prefs, int mrefs, int parity, int mode)
+                               int w, int prefs, int mrefs, int parity, int mode,
+                               int alignment)
 {
     uint16_t *dst  = dst1;
     uint16_t *prev = prev1;
@@ -169,7 +170,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
     uint16_t *prev2 = parity ? prev : cur ;
     uint16_t *next2 = parity ? cur  : next;
 
-    const int edge = FFMAX(MAX_ALIGN / 2 - 1, 3);
+    const int edge = FFMAX(alignment / 2 - 1, 3);
     int offset = FFMAX(w - edge, 3);
 
     mrefs /= 2;
@@ -199,7 +200,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
     int slice_start = (td->h *  jobnr   ) / nb_jobs;
     int slice_end   = (td->h * (jobnr+1)) / nb_jobs;
     int y;
-    int edge = 3 + MAX_ALIGN / df - 1;
+    int edge = 3 + s->req_align / df - 1;
 
     /* filtering reads 3 pixels to the left/right; to avoid invalid reads,
      * we need to call the c variant which avoids this for border pixels
@@ -219,7 +220,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
             s->filter_edges(dst, prev, cur, next, td->w,
                             y + 1 < td->h ? refs : -refs,
                             y ? -refs : refs,
-                            td->parity ^ td->tff, mode);
+                            td->parity ^ td->tff, mode, s->req_align);
         } else {
             memcpy(&td->frame->data[td->plane][y * td->frame->linesize[td->plane]],
                    &s->cur->data[td->plane][y * refs], td->w * df);
@@ -303,6 +304,7 @@ static int config_output(AVFilterLink *outlink)
 
     s->csp = av_pix_fmt_desc_get(outlink->format);
     s->filter = filter;
+    s->req_align = 1;
     if (s->csp->comp[0].depth > 8) {
         s->filter_line  = filter_line_c_16bit;
         s->filter_edges = filter_edges_16bit;
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 257c3f9199..9dd73f8e44 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -53,6 +53,7 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
     int bit_depth = (!yadif->csp) ? 8
                                   : yadif->csp->comp[0].depth;
 
+    yadif->req_align = 8;
     if (bit_depth >= 15) {
         if (EXTERNAL_SSE2(cpu_flags))
             yadif->filter_line = ff_yadif_filter_line_16bit_sse2;
diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h
index c928911b35..b81f2fc1d9 100644
--- a/libavfilter/yadif.h
+++ b/libavfilter/yadif.h
@@ -66,11 +66,13 @@ typedef struct YADIFContext {
     /**
      * Required alignment for filter_line
      */
+    int req_align;
     void (*filter_line)(void *dst,
                         void *prev, void *cur, void *next,
                         int w, int prefs, int mrefs, int parity, int mode);
     void (*filter_edges)(void *dst, void *prev, void *cur, void *next,
-                         int w, int prefs, int mrefs, int parity, int mode);
+                         int w, int prefs, int mrefs, int parity, int mode,
+                         int alignment);
 
     const AVPixFmtDescriptor *csp;
     int eof;
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH v2 3/5] avfilter/vf_yadif: reformat code to improve readability
  2022-07-21  2:25 [FFmpeg-devel] [PATCH v2 1/5] avfilter/vf_yadif: Fix edge size when MAX_ALIGN is < 4 Chris Phlipot
  2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 2/5] avfilter/vf_yadif: Allow alignment to be configurable Chris Phlipot
@ 2022-07-21  2:25 ` Chris Phlipot
  2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 4/5] avfilter/vf_yadif: Process more pixels using filter_line Chris Phlipot
  2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 5/5] avfilter/vf_yadif: Add x86_64 avx yadif asm Chris Phlipot
  3 siblings, 0 replies; 5+ messages in thread
From: Chris Phlipot @ 2022-07-21  2:25 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Chris Phlipot

Reformat some of the code to improve readability and reduce code
duplication. This change is intended to be purely cosmetic and
shouldn't result in any functional changes.

Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
---
 libavfilter/vf_yadif.c | 11 +++++------
 libavfilter/yadif.h    |  3 +--
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index 42f6246330..54109566be 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -211,16 +211,15 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
             uint8_t *cur  = &s->cur ->data[td->plane][y * refs];
             uint8_t *next = &s->next->data[td->plane][y * refs];
             uint8_t *dst  = &td->frame->data[td->plane][y * td->frame->linesize[td->plane]];
+            int     prefs = y + 1 < td->h ? refs : -refs;
+            int     mrefs = y ? -refs : refs;
+            int    parity = td->parity ^ td->tff;
             int     mode  = y == 1 || y + 2 == td->h ? 2 : s->mode;
             s->filter_line(dst + pix_3, prev + pix_3, cur + pix_3,
                            next + pix_3, td->w - edge,
-                           y + 1 < td->h ? refs : -refs,
-                           y ? -refs : refs,
-                           td->parity ^ td->tff, mode);
+                           prefs, mrefs, parity, mode);
             s->filter_edges(dst, prev, cur, next, td->w,
-                            y + 1 < td->h ? refs : -refs,
-                            y ? -refs : refs,
-                            td->parity ^ td->tff, mode, s->req_align);
+                            prefs, mrefs, parity, mode, s->req_align);
         } else {
             memcpy(&td->frame->data[td->plane][y * td->frame->linesize[td->plane]],
                    &s->cur->data[td->plane][y * refs], td->w * df);
diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h
index b81f2fc1d9..f271fe8304 100644
--- a/libavfilter/yadif.h
+++ b/libavfilter/yadif.h
@@ -67,8 +67,7 @@ typedef struct YADIFContext {
      * Required alignment for filter_line
      */
     int req_align;
-    void (*filter_line)(void *dst,
-                        void *prev, void *cur, void *next,
+    void (*filter_line)(void *dst, void *prev, void *cur, void *next,
                         int w, int prefs, int mrefs, int parity, int mode);
     void (*filter_edges)(void *dst, void *prev, void *cur, void *next,
                          int w, int prefs, int mrefs, int parity, int mode,
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH v2 4/5] avfilter/vf_yadif: Process more pixels using filter_line
  2022-07-21  2:25 [FFmpeg-devel] [PATCH v2 1/5] avfilter/vf_yadif: Fix edge size when MAX_ALIGN is < 4 Chris Phlipot
  2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 2/5] avfilter/vf_yadif: Allow alignment to be configurable Chris Phlipot
  2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 3/5] avfilter/vf_yadif: reformat code to improve readability Chris Phlipot
@ 2022-07-21  2:25 ` Chris Phlipot
  2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 5/5] avfilter/vf_yadif: Add x86_64 avx yadif asm Chris Phlipot
  3 siblings, 0 replies; 5+ messages in thread
From: Chris Phlipot @ 2022-07-21  2:25 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Chris Phlipot

filter_line is generally vectorized, whereas filter_edge is implemented
in C. Currently we rely on filter_edge to process non-edges in cases
where the width doesn't match the alignment. This causes us to process
non-edge pixels with the slow C implementation vs the faster SSE
implementation.

It is generally faster to process 8 pixels with the slowest SSE2
vectorized implementation than it is to process 2 pixels with the
C implementation. Therefore, if filter_edge needs to process 2 or
more non-edge pixels, it would be faster to process these non-edge
pixels with filter_line instead even if it processes more pixels
than necessary.

To address this, we use filter_line so long as we know that at least
2 pixels will be used in the final output even if the rest of the
computed pixels are invalid. Any incorrect output pixels generated by
filter_line will be overwritten by the following call to filter_edge.
In addition we avoid running filter_line if it would read or write
pixels outside the current slice.

Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
---
 libavfilter/vf_yadif.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index 54109566be..394c04a985 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -201,6 +201,8 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
     int slice_end   = (td->h * (jobnr+1)) / nb_jobs;
     int y;
     int edge = 3 + s->req_align / df - 1;
+    int filter_width_target = td->w - 3;
+    int filter_width_rounded_up = (filter_width_target & ~(s->req_align-1)) + s->req_align;
 
     /* filtering reads 3 pixels to the left/right; to avoid invalid reads,
      * we need to call the c variant which avoids this for border pixels
@@ -215,11 +217,28 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
             int     mrefs = y ? -refs : refs;
             int    parity = td->parity ^ td->tff;
             int     mode  = y == 1 || y + 2 == td->h ? 2 : s->mode;
+
+            /* Adjust width and alignment to process extra pixels in filter_line
+             * using potentially vectorized code so long as it doesn't cause
+             * reads or writes outside of the current slice. filter_edge will
+             * correct any incorrect pixels written by filter_line in this
+             * scenario.
+             */
+            int filter_width;
+            int edge_alignment;
+            if (filter_width_rounded_up - filter_width_target >= 2
+                && y*refs + filter_width_rounded_up < slice_end * refs + refs - 3) {
+                filter_width = filter_width_rounded_up;
+                edge_alignment = 1;
+            } else {
+                filter_width = td->w - edge;
+                edge_alignment = s->req_align;
+            }
             s->filter_line(dst + pix_3, prev + pix_3, cur + pix_3,
-                           next + pix_3, td->w - edge,
+                           next + pix_3, filter_width,
                            prefs, mrefs, parity, mode);
             s->filter_edges(dst, prev, cur, next, td->w,
-                            prefs, mrefs, parity, mode, s->req_align);
+                            prefs, mrefs, parity, mode, edge_alignment);
         } else {
             memcpy(&td->frame->data[td->plane][y * td->frame->linesize[td->plane]],
                    &s->cur->data[td->plane][y * refs], td->w * df);
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH v2 5/5] avfilter/vf_yadif: Add x86_64 avx yadif asm
  2022-07-21  2:25 [FFmpeg-devel] [PATCH v2 1/5] avfilter/vf_yadif: Fix edge size when MAX_ALIGN is < 4 Chris Phlipot
                   ` (2 preceding siblings ...)
  2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 4/5] avfilter/vf_yadif: Process more pixels using filter_line Chris Phlipot
@ 2022-07-21  2:25 ` Chris Phlipot
  3 siblings, 0 replies; 5+ messages in thread
From: Chris Phlipot @ 2022-07-21  2:25 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Chris Phlipot

Add a new version of yadif_filter_line performed using packed bytes
instead of the packed words used by the current implementation. As
a result this implementation runs almost 2x as fast as the current
fastest SSSE3 implementation.

This implementation is created from scratch based on the C code, with
the goal of keeping all intermediate values within 8-bits so that
the vectorized code can be computed using packed bytes. differences
are as follows:
- Use algorithms to compute avg and abs difference using only 8-bit
 intermediate values.
- Reworked the mode 1 code by applying various mathematical identities
 to keep all intermediate values within 8-bits.
- Attempt to compute the spatial score using only 8-bits. The actual
 spatial score fits within this range 97% (content dependent) of the
 time for the entire 128-bit xmm vector. In the case that spatial
 score needs more than 8-bits to be represented, we detect this case,
 and recompute the spatial score using 16-bit packed words instead.

In 3% of cases the spatial_score will need more than 8 bits to store,
so we have a slow path, where the spatial score is computed using
packed words instead.

This implementation is currently limited to x86_64 due to the number
of registers required. x86_32 is possible, but the performance benefit
over the existing SSSE3 implementation is not as great, due to all of the
stack spills that would result from having far fewer registers. ASM was
not generated for the 32-bit variant due to limited ROI, as most AVX
users are likely on 64-bit OS at this point and 32-bit users would
lose out on most of the performance benefit.

Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
---
 libavfilter/x86/Makefile         |   2 +-
 libavfilter/x86/vf_yadif_init.c  |  11 +
 libavfilter/x86/vf_yadif_x64.asm | 492 +++++++++++++++++++++++++++++++
 3 files changed, 504 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/x86/vf_yadif_x64.asm

diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index e87481bd7a..19161ffa23 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -80,4 +80,4 @@ X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER)       += x86/vf_transpose.o
 X86ASM-OBJS-$(CONFIG_VOLUME_FILTER)          += x86/af_volume.o
 X86ASM-OBJS-$(CONFIG_V360_FILTER)            += x86/vf_v360.o
 X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER)          += x86/vf_w3fdif.o
-X86ASM-OBJS-$(CONFIG_YADIF_FILTER)           += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
+X86ASM-OBJS-$(CONFIG_YADIF_FILTER)           += x86/vf_yadif.o x86/vf_yadif_x64.o x86/yadif-16.o x86/yadif-10.o
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 9dd73f8e44..1369081690 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -29,6 +29,11 @@ void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,
 void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
                                 void *next, int w, int prefs,
                                 int mrefs, int parity, int mode);
+#if ARCH_X86_64
+void ff_yadif_filter_line_avx(void *dst, void *prev, void *cur,
+                              void *next, int w, int prefs,
+                              int mrefs, int parity, int mode);
+#endif
 
 void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
                                      void *next, int w, int prefs,
@@ -71,5 +76,11 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
             yadif->filter_line = ff_yadif_filter_line_sse2;
         if (EXTERNAL_SSSE3(cpu_flags))
             yadif->filter_line = ff_yadif_filter_line_ssse3;
+#if ARCH_X86_64
+        if (EXTERNAL_AVX(cpu_flags)) {
+            yadif->filter_line = ff_yadif_filter_line_avx;
+            yadif->req_align = 16;
+        }
+#endif
     }
 }
diff --git a/libavfilter/x86/vf_yadif_x64.asm b/libavfilter/x86/vf_yadif_x64.asm
new file mode 100644
index 0000000000..c7e236ade7
--- /dev/null
+++ b/libavfilter/x86/vf_yadif_x64.asm
@@ -0,0 +1,492 @@
+;******************************************************************************
+;* Copyright (C) 2006-2011 Michael Niedermayer <michaelni@gmx.at>
+;*               2010      James Darnley <james.darnley@gmail.com>
+;*               2013-2022 Chris Phlipot <cphlipot0@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software;* you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation;* either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%if ARCH_X86_64
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pb_127: times 16 db 127
+pb_128: times 16 db 128
+
+SECTION .text
+
+; Rename a register so that it can be used for a new purpose. The old name is
+; will become undefined so that any additional usage of the old name will
+; result in a compiler/assembler error.
+%macro RENAME_REGISTER 2
+    %ifidni %1,%2
+        %error "Can't rename a register to itself."
+    %endif
+    %xdefine %1 %2
+    %undef %2
+%endmacro
+
+; Usage: dst, arg1, arg2, temp1
+; Compute the absolute difference of arg1 and arg2 and places them in dst.
+; all operations are perfomed using packed bytes. Unlike ARM NEON there is no
+; instruction to do this, so instead we emulate it with multiple instructions.
+; eg. dst = abs(arg1 - arg2)
+%macro absdif_pb 4
+    %ifidni %1,%3
+        %error "arg1 and arg3 must be different"
+    %elifidni %1,%4
+        %error "arg1 and arg4 must be different"
+    %elifidni %3,%4
+        %error "arg3 and arg4 must be different"
+    %endif
+    psubusb        %4, %3, %2
+    psubusb        %1, %2, %3
+    por            %1, %1, %4
+%endmacro
+
+; Usage: dst, arg1, arg2, pb_1, temp1
+; Compute the average of 2 unsigned values rounded down.
+; SSE provides pavgb, which rounds up. Unlike ARM NEON, SSE doen't provide
+; an instruction that computes the avg of 2 unsigned bytes rounded down, so
+; instead we emulate it with this macro.
+; eg. dst = (arg1 + arg2) >> 1
+%macro avg_truncate_pb 5
+    %ifidni %1,%3
+        %error "arg1 and arg3 must be different"
+    %elifidni %1,%4
+        %error "arg1 and arg5 must be different"
+    %endif
+    pxor   %5, %2, %3
+    pavgb  %1, %2, %3
+    pand   %5, %5, %4
+    psubb  %1, %1, %5
+%endmacro
+
+INIT_XMM avx
+
+cglobal yadif_filter_line, 5, 15, 8, 240, dst, prev, cur, next, width, prefs, \
+                                        mrefs, parity, mode
+%xdefine cur_plus_prefs  r5
+%xdefine cur_plus_mrefs  r6
+%xdefine prefs r7
+%xdefine next2  r8
+%xdefine prev2_2mrefs  r9
+%xdefine mrefs r10
+%xdefine prev2_2prefs  r11
+%xdefine next2_2mrefs  r12
+%xdefine prev_plus_mrefs  r13
+%xdefine next_plus_mrefs  r14
+%xdefine prev2_2mrefs_stack_spill  [rsp - 24]
+%xdefine pb_1_reg m15
+
+%xdefine old_absdif_ahead_stack [rsp - 128]
+%xdefine absdif_here [rsp - 80]
+%xdefine absdif_behind [rsp - 64]
+
+%xdefine spatial_predicate_stack [rsp - 112]
+%xdefine spatial_pred_check_minus_1 [rsp - 16]
+
+; unaligned loads are slower than aligned loads. It is often benificial to
+; store values in an aligned location after doing an aligned load so that all
+; future loads of that value will be aligned.
+%xdefine cur_plus_prefs_x_stack [rsp]
+%xdefine cur_plus_mrefs_x_stack [rsp + 16]
+%xdefine cur_plus_mrefs_x_2_stack [rsp + 96]
+%xdefine cur_plus_prefs_x_minus_2 [rsp + 80]
+
+; Absolute differences used for CHECK(-1)
+%xdefine chkneg1_ad2_stack [rsp - 96]
+%xdefine chkneg1_ad1_stack [rsp + 176]
+%xdefine chkneg1_ad0_stack [rsp - 48]
+
+; Absolute differences used for CHECK(-2)
+%xdefine chkneg2_ad2_stack [rsp + 160]
+%xdefine chkneg2_ad1_stack [rsp + 144]
+%xdefine chkneg2_ad0_stack [rsp + 208]
+
+; Absolute differences used for CHECK(1)
+%xdefine chkpos1_ad2_stack [rsp + 112]
+%xdefine chkpos1_ad1_stack [rsp + 128]
+; chkpos1_ad0 has no stack locatation since it is kept in a register.
+
+; Absolute differences used for CHECK(2)
+%xdefine chkpos2_ad2_stack [rsp + 64]
+%xdefine chkpos2_ad1_stack [rsp + 48]
+%xdefine chkpos2_ad0_stack [rsp + 32]
+
+    movsxd          prefs, DWORD prefsm
+    movsxd          mrefs, DWORD mrefsm
+; Bail out early if width is zero.
+    test            widthd, widthd
+    jle             .return
+
+; Initialize all pointers. Unlike the C code the pointers all point to the
+; location where x equals 0 and remain unchanged instead of the pointers being
+; incremented on every loop iteration. Instead only x is incremented, and x86
+; memory addressing is used to add the current value of x on every memory
+; access at (most likely) zero cost.
+    lea             cur_plus_prefs, [curq + prefs]
+    movu            m0, [curq + prefs - 1]
+    lea             cur_plus_mrefs, [curq + mrefs]
+    movu            m1, [curq + mrefs - 1]
+    absdif_pb       m0, m0, m1, m5
+    cmp             dword paritym, 0
+    mov             next2, curq
+RENAME_REGISTER prev2, curq
+    cmove           next2, nextq
+    pslldq          m8, m0, 14
+    mova            old_absdif_ahead_stack, m8
+    cmovne          prev2, prevq
+    lea             prev_plus_mrefs, [prevq + mrefs]
+    add             prevq, prefs
+RENAME_REGISTER prev_plus_prefs, prevq
+    lea             next_plus_mrefs, [nextq + mrefs]
+    add             nextq, prefs
+RENAME_REGISTER next_plus_prefs, nextq
+    lea             prev2_2mrefs, [prev2 + 2*mrefs]
+    mov             prev2_2mrefs_stack_spill, prev2_2mrefs
+    lea             prev2_2prefs, [prev2 + 2*prefs]
+    lea             next2_2mrefs, [next2 + 2*mrefs]
+RENAME_REGISTER next2_2prefs, mrefs
+    lea             next2_2prefs, [next2 + 2*prefs]
+RENAME_REGISTER x, prefs
+    xor             x, x
+    mova            pb_1_reg, [pb_1]
+    mov             prev2_2mrefs,  prev2_2mrefs_stack_spill
+
+    jmp             .loop_start
+.loop_tail:
+    paddusb         m3, m2, m1
+    pminub          m0, m9, m3
+    psubusb         m2, m2, m1
+    pmaxub          m0, m0, m2
+    movu            [dstq + x], m0
+    add             x, 16
+    cmp             x, widthq
+    jge             .return
+.loop_start:
+; Start by computing the spatial score
+; We attempt to Compute the spatial score using saturated adds. In real
+; world content the entire spatial score 16-byte xmm vector will be able
+; to accurately represent the spatial score in 8-bits > 97% of the
+; time. Because of this we try computing the spatial score with 8-bit
+; first since it is 2x as fast, and check if we saturated the computation later.
+; The original spatial score can potentially be in the range of -1 to 765
+; Instead for this approach, we map the lower end of that to 8-bits using
+; the range -128 to 127.
+; If we detected that this assumption may have failed we instead re-compute
+; the spatial score using the full 16-bit range needed to represent -1 to 765.
+;
+; Before we compute the spatial score, we pre-compute most of the absolute
+; difference values used in the C code's CHECK() macros. These absolute
+; differences are then stored to the stack so that they can be re-used for the
+; slower 16-bit spatial score approach in case that is needed.
+    movu            m6, [cur_plus_mrefs + x - 3]
+    movu            m11, [cur_plus_mrefs + x - 2]
+    movu            m2, [cur_plus_mrefs + x - 1]
+    movu            m3, [cur_plus_mrefs + x]
+    movu            m13, [cur_plus_prefs + x]
+    movu            m0, [cur_plus_mrefs + x + 1]
+    movu            m1, [cur_plus_prefs + x + 1]
+    absdif_pb       m14, m0, m1, m5             ; abs(cur[mrefs+1]-cur[prefs+1])
+    avg_truncate_pb m10, m13, m3, pb_1_reg, m5  ; spatial_pred = (c+d) >> 2
+    mova            spatial_predicate_stack, m10
+    movu            m7, [cur_plus_prefs + x + 2]
+    absdif_pb       m10, m11, m13, m5           ; abs(cur[mrefs-2]-cur[prefs])
+    mova            chkneg1_ad2_stack, m10
+    absdif_pb       m8, m2, m1, m5              ; abs(cur[mrefs-1]-cur[prefs+1])
+    absdif_pb       m9, m3, m7, m5              ; abs(cur[mrefs]-cur[prefs+2])
+    absdif_pb       m10, m6, m1, m5             ; abs(cur[mrefs-3]-cur[prefs+1])
+    mova            chkneg2_ad2_stack, m10
+    absdif_pb       m10, m11, m7, m5            ; abs(cur[mrefs-2]-cur[prefs+2])
+    mova            chkneg2_ad1_stack, m10
+    movu            m4, [cur_plus_prefs + x + 3]
+    absdif_pb       m10, m2, m4, m5             ; abs(cur[mrefs-1]-cur[prefs+3])
+    mova            chkneg2_ad0_stack, m10
+    movu            m12, [cur_plus_mrefs + x + 2]
+    absdif_pb       m10, m12, m13, m5           ; abs(cur[mrefs+2]-cur[prefs])
+    mova            cur_plus_prefs_x_stack, m13
+    mova            chkpos1_ad2_stack, m10
+    movu            m6, [cur_plus_prefs + x - 1]
+    absdif_pb       m10, m0, m6, m5             ; abs(cur[mrefs+1]-cur[prefs-1])
+    mova            chkpos1_ad1_stack, m10
+    movu            m10, [cur_plus_prefs + x - 2]
+    mova            cur_plus_mrefs_x_stack, m3
+    absdif_pb       m13, m10, m3, m5            ; abs(cur[mrefs]-cur[prefs-2])
+    movu            m4, [cur_plus_mrefs + x + 3]
+    absdif_pb       m4, m4, m6, m5              ; abs(cur[mrefs+3]-cur[prefs-1])
+    mova            chkpos2_ad2_stack, m4
+    absdif_pb       m3, m12, m10, m5            ; abs(cur[mrefs+2]-cur[prefs-2])
+    mova            chkpos2_ad1_stack, m3
+    movu            m4, [cur_plus_prefs + x - 3]
+    absdif_pb       m3, m0, m4, m5              ; abs(cur[mrefs+1]-cur[prefs-3])
+    mova            chkpos2_ad0_stack, m3
+    mova            chkneg1_ad1_stack, m8
+    paddusb         m5, m8, chkneg1_ad2_stack
+    mova            chkneg1_ad0_stack, m9
+    paddusb         m4, m9, pb_1_reg
+    paddusb         m5, m5, m4
+    mova            m3, old_absdif_ahead_stack
+    palignr         m4, m14, m3, 15
+    palignr         m3, m14, m3, 14
+    mova            old_absdif_ahead_stack, m14
+    mova            absdif_here, m4
+    paddusb         m4, m14, m4
+    mova            absdif_behind, m3
+    paddusb         m4, m4, m3
+    pxor            m4, m4, [pb_128]
+    pxor            m5, m5, [pb_128]
+    pcmpgtb         m8, m4, m5
+    pcmpeqb         m14, m4, [pb_127]
+    por             m8, m8, m14
+    pminsb          m4, m4, m5
+    avg_truncate_pb m1, m1, m2, pb_1_reg, m5
+    mova            spatial_pred_check_minus_1, m1
+    mova            m1, chkneg2_ad1_stack
+    paddusb         m2, m1, chkneg2_ad2_stack
+    paddusb         m5, pb_1_reg, chkneg2_ad0_stack
+    paddusb         m2, m2, m5
+    avg_truncate_pb m11, m11, m7, pb_1_reg, m5
+    mova            m3, chkpos1_ad1_stack
+    paddusb         m3, m3, chkpos1_ad2_stack
+    paddusb         m5, m13, pb_1_reg
+    paddusb         m7, m3, m5
+    avg_truncate_pb m3, m6, m0, pb_1_reg, m5
+    pxor            m0, m2, [pb_128]
+    pcmpgtb         m2, m4, m0
+    pand            m5, m8, m2
+    pblendvb        m6, m4, m0, m5
+    pcmpeqb         m0, m4, [pb_127]
+    pand            m2, m5, m0
+    pxor            m7, m7, [pb_128]
+    pminsb          m0, m6, m7
+    pcmpeqb         m4, m0, [pb_127]
+    por             m2, m4, m2
+    ptest           m2, m2
+    jne             .spatial_check_16_bit
+; At this point we know if we can continue on the fast path with saturating
+; spatial score computation while maintaining bit-accuracy, or if we need to
+; bail out and perform the spatial score computation using full 16-bit words
+; to store the score value. check_2_saturate is only executed here if we know
+; we don't need to go down the slow path.
+.check_2_saturate:
+    mova            m2, spatial_predicate_stack
+    pblendvb        m1, m2, spatial_pred_check_minus_1, m8
+    pblendvb        m1, m11, m5
+    pcmpgtb         m2, m6, m7
+    pcmpeqb         m5, m6, [pb_127]
+    por             m2, m5, m2
+    pblendvb        m1, m3, m2
+    mova            m3, chkpos2_ad1_stack
+    paddusb         m3, m3, chkpos2_ad2_stack
+    paddusb         m4, pb_1_reg, chkpos2_ad0_stack
+    paddusb         m3, m3, m4
+    pxor            m3, m3, [pb_128]
+    pcmpgtb         m0, m0, m3
+    pand            m0, m0, m2
+    avg_truncate_pb m2, m12, m10, pb_1_reg, m5
+    pblendvb        m9, m1, m2, m0
+.temporal_check:
+    mova            m0, cur_plus_mrefs_x_stack
+    mova            m8, cur_plus_prefs_x_stack
+    movu            m1, [prev2 + x]
+    movu            m6, [next2 + x]
+    avg_truncate_pb m2, m6, m1, pb_1_reg, m5
+    absdif_pb       m1, m1, m6, m5
+    movu            m6, [prev_plus_mrefs + x]
+    movu            m4, [prev_plus_prefs + x]
+    absdif_pb       m6, m6, m0, m5
+    absdif_pb       m4, m4, m8, m5
+    avg_truncate_pb m6, m6, m4, pb_1_reg, m5
+    movu            m4, [next_plus_mrefs + x]
+    movu            m3, [next_plus_prefs + x]
+    absdif_pb       m4, m4, m0, m5
+    absdif_pb       m3, m3, m8, m5
+    avg_truncate_pb m4, m4, m3, pb_1_reg, m5
+    pmaxub          m6, m6, m4
+    psrlw           m1, m1, 1
+    pand            m1, m1, [pb_127]
+    pmaxub          m1, m1, m6
+    cmp             DWORD modem, 1
+    jg              .loop_tail
+.handle_mode_1:
+; Handle the "if (!(mode&2))" section.
+; This section has undergone some complex
+; transformations with respect to the C implementation in order to
+; ensure that all inputs, outputs and intermediate values can be
+; stored in 8-bit unsigned values. The code is transformed with
+; various identities to prevent signed intermediate values which
+; would require an extra 9th bit for the sign, which we don't have.
+; The main identities are applied:
+; 1. -MAX(a-b, c-d) = MIN(b-c, d-c)
+; 2. MIN(a-c, b-c) = MIN(a, b)-c
+; The following from the C code:
+;
+; int max = FFMAX3(d-e, d-c, FFMIN(b-c, f-e));
+; diff = FFMAX3(diff, min, -max);
+;
+; becomes:
+; int negative_max = FFMIN( FFMIN(e, c)-d, FFMAX(c-b, e-f))
+; diff = FFMAX3(diff, min, negative_max);
+;
+; Lastly we know that diff must be non-negative in the end, so
+; intermediate negative values don't matter. to keep computations
+; within 8 bits, we use saturating subtraction which replaces all
+; negative intermediate results with 0, but doesn't affect the
+; final value assigned to diff.
+    movu            m6, [prev2_2mrefs + x]
+    movu            m4, [next2_2mrefs + x]
+    avg_truncate_pb m6, m6, m4, pb_1_reg, m5
+    movu            m4, [prev2_2prefs + x]
+    movu            m3, [next2_2prefs + x]
+    avg_truncate_pb m4, m4, m3, pb_1_reg, m5
+    psubusb         m3, m8, m2
+    psubusb         m5, m0, m2
+    pminub          m3, m3, m5
+    psubusb         m5, m0, m6
+    psubusb         m7, m8, m4
+    pmaxub          m5, m5, m7
+    pminub          m3, m3, m5
+    psubusb         m5, m2, m8
+    psubusb         m7, m2, m0
+    pminub          m5, m5, m7
+    psubusb         m6, m6, m0
+    psubusb         m4, m4, m8
+    pmaxub          m6, m6, m4
+    pminub          m6, m5, m6
+    pmaxub          m6, m6, m3
+    pmaxub          m1, m1, m6
+    jmp             .loop_tail
+.spatial_check_16_bit:
+; Assuming all else fails, we compute the spatial score using packed words to
+; store the temporary values. Every input register containing packed bytes is
+; unpacked into 2 separate registers with packed words, which are then
+; processed identically. This path should generally be run < 3% of the
+; time, and is kept mainly to ensure the output is bit-accurate compared
+; to the C implementation.
+    mova            cur_plus_mrefs_x_2_stack, m12
+    mova            cur_plus_prefs_x_minus_2, m10
+    mova            m5, old_absdif_ahead_stack
+    pmovzxbw        m0, m5
+    mova            m4, absdif_here
+    pmovzxbw        m2, m4
+    paddw           m0, m0, m2
+    pxor            m12, m12, m12
+    punpckhbw       m2, m4, m12
+    punpckhbw       m5, m5, m12
+    paddw           m2, m5, m2
+    mova            m7, absdif_behind
+    pmovzxbw        m5, m7
+    pcmpeqd         m4, m4, m4
+    paddw           m5, m5, m4
+    paddw           m9, m0, m5
+    punpckhbw       m5, m7, m12
+    paddw           m5, m5, m4
+    paddw           m7, m2, m5
+    mova            m0, chkneg1_ad2_stack
+    pmovzxbw        m2, m0
+    mova            m4, chkneg1_ad1_stack
+    pmovzxbw        m5, m4
+    paddw           m2, m2, m5
+    punpckhbw       m5, m0, m12
+    punpckhbw       m4, m4, m12
+    paddw           m4, m5, m4
+    mova            m0, chkneg1_ad0_stack
+    pmovzxbw        m5, m0
+    paddw           m5, m2, m5
+    pminsw          m6, m9, m5
+    punpckhbw       m2, m0, m12
+    paddw           m2, m4, m2
+    pminsw          m14, m2, m7
+    pcmpgtw         m4, m9, m5
+    pcmpgtw         m10, m7, m2
+    packsswb        m0, m4, m10
+    mova            m2, spatial_predicate_stack
+    pblendvb        m0, m2, spatial_pred_check_minus_1, m0
+    mova            spatial_predicate_stack, m0
+    mova            m0, chkneg2_ad2_stack
+    pmovzxbw        m2, m0
+    mova            m1, chkneg2_ad0_stack
+    pmovzxbw        m7, m1
+    paddw           m2, m7, m2
+    mova            m9, chkneg2_ad1_stack
+    pmovzxbw        m7, m9
+    paddw           m2, m2, m7
+    punpckhbw       m7, m0, m12
+    punpckhbw       m5, m1, m12
+    paddw           m5, m5, m7
+    punpckhbw       m7, m9, m12
+    paddw           m5, m5, m7
+    mova            m0, chkpos1_ad2_stack
+    pmovzxbw        m7, m0
+    mova            m1, chkpos1_ad1_stack
+    pmovzxbw        m8, m1
+    paddw           m8, m8, m7
+    punpckhbw       m7, m0, m12
+    punpckhbw       m0, m1, m12
+    paddw           m7, m0, m7
+    pmovzxbw        m1, m13
+    paddw           m9, m8, m1
+    punpckhbw       m0, m13, m12
+    paddw           m8, m7, m0
+    pcmpgtw         m0, m6, m2
+    pand            m0, m0, m4
+    pcmpgtw         m4, m14, m5
+    pand            m4, m10, m4
+    pblendvb        m1, m6, m2, m0
+    pblendvb        m14, m5, m4
+    packsswb        m0, m0, m4
+    mova            m5, spatial_predicate_stack
+    pblendvb        m0, m5, m11, m0
+    pcmpgtw         m5, m1, m9
+    pcmpgtw         m4, m14, m8
+    packsswb        m6, m5, m4
+    pblendvb        m13, m0, m3, m6
+    mova            m0, chkpos2_ad2_stack
+    pmovzxbw        m3, m0
+    mova            m7, chkpos2_ad1_stack
+    pmovzxbw        m6, m7
+    paddw           m3, m6, m3
+    punpckhbw       m6, m0, m12
+    punpckhbw       m0, m7, m12
+    paddw           m0, m0, m6
+    mova            m7, chkpos2_ad0_stack
+    pmovzxbw        m6, m7
+    paddw           m3, m3, m6
+    punpckhbw       m6, m7, m12
+    paddw           m0, m0, m6
+    pminsw          m1, m9, m1
+    pcmpgtw         m1, m1, m3
+    pminsw          m14, m8, m14
+    pcmpgtw         m14, m14, m0
+    pand            m1, m1, m5
+    pand            m14, m14, m4
+    packsswb        m14, m1, m14
+    mova            m0, cur_plus_mrefs_x_2_stack
+    mova            m5, cur_plus_prefs_x_minus_2
+    pxor            m1, m5, m0
+    pavgb           m0, m0, m5
+    pand            m1, m1, [pb_1]
+    psubb           m1, m0, m1
+    pblendvb        m9, m13, m1, m14
+    jmp     .temporal_check
+.return:
+    RET
+
+%endif  ; ARCH_X86_64
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-07-21  2:26 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-21  2:25 [FFmpeg-devel] [PATCH v2 1/5] avfilter/vf_yadif: Fix edge size when MAX_ALIGN is < 4 Chris Phlipot
2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 2/5] avfilter/vf_yadif: Allow alignment to be configurable Chris Phlipot
2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 3/5] avfilter/vf_yadif: reformat code to improve readability Chris Phlipot
2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 4/5] avfilter/vf_yadif: Process more pixels using filter_line Chris Phlipot
2022-07-21  2:25 ` [FFmpeg-devel] [PATCH v2 5/5] avfilter/vf_yadif: Add x86_64 avx yadif asm Chris Phlipot

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git