* [FFmpeg-devel] [PATCH 1/5] avfilter/vf_yadif: Fix edge size when MAX_ALIGN is < 4
@ 2022-07-20 4:41 Chris Phlipot
2022-07-20 4:41 ` [FFmpeg-devel] [PATCH 2/5] avfilter/vf_yadif: Allow alignment to be configurable Chris Phlipot
` (3 more replies)
0 siblings, 4 replies; 7+ messages in thread
From: Chris Phlipot @ 2022-07-20 4:41 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Chris Phlipot
If alignment is set to less than 4, filter_edges will produce incorrect
output and fail to filter the entire edge. To fix this, make sure that
the edge size is at least 3.
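For illustration, a hedged sketch of the failure mode (not from the
patch; it assumes a hypothetical MAX_ALIGN of 1, as the next patch makes
possible via a configurable alignment, and df == 1):

    /* Unclamped: edge = 1 - 1 = 0, so offset = FFMAX(w - 0, 3) == w and
     * the right-border loop body never runs; columns w-3..w-1 keep
     * whatever filter_line computed from reads past the row end.
     * Clamped: edge = FFMAX(1 - 1, 3) = 3, so offset = w - 3 and the
     * 3 true border pixels are always re-filtered safely. */
    const int edge = FFMAX(MAX_ALIGN - 1, 3);
    int offset     = FFMAX(w - edge, 3);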
Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
---
libavfilter/vf_yadif.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index afa4d1d53d..055327d7a4 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -120,7 +120,7 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
uint8_t *prev2 = parity ? prev : cur ;
uint8_t *next2 = parity ? cur : next;
- const int edge = MAX_ALIGN - 1;
+ const int edge = FFMAX(MAX_ALIGN - 1, 3);
int offset = FFMAX(w - edge, 3);
/* Only edge pixels need to be processed here. A constant value of false
@@ -169,7 +169,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
uint16_t *prev2 = parity ? prev : cur ;
uint16_t *next2 = parity ? cur : next;
- const int edge = MAX_ALIGN / 2 - 1;
+ const int edge = FFMAX(MAX_ALIGN / 2 - 1, 3);
int offset = FFMAX(w - edge, 3);
mrefs /= 2;
--
2.25.1
* [FFmpeg-devel] [PATCH 2/5] avfilter/vf_yadif: Allow alignment to be configurable
2022-07-20 4:41 [FFmpeg-devel] [PATCH 1/5] avfilter/vf_yadif: Fix edge size when MAX_ALIGN is < 4 Chris Phlipot
@ 2022-07-20 4:41 ` Chris Phlipot
2022-07-20 4:41 ` [FFmpeg-devel] [PATCH 3/5] avfilter/vf_yadif: reformat code to improve readability Chris Phlipot
` (2 subsequent siblings)
3 siblings, 0 replies; 7+ messages in thread
From: Chris Phlipot @ 2022-07-20 4:41 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Chris Phlipot
Allow the alignment to be determined by the yadif_filter_line
implementation in use. Currently this is either 1 or 8, depending
on whether the C code or the x86 SSE code is used, but this allows
for future implementations that use a larger alignment.
Raising MAX_ALIGN to 32 for an AVX2 implementation could potentially
hurt the performance of the SSE implementation, so we instead allow
yadif to use the smallest needed alignment, maintaining the existing
performance if implementations with wider vectors are added.
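As a hedged sketch of the intent (hypothetical code, not part of this
series; ff_yadif_filter_line_avx2 is an assumed name), a wider
implementation would simply advertise its own alignment in its init:

    /* Hypothetical future init: only this path pays for the wider
     * alignment; the C and SSE paths keep req_align of 1 and 8. */
    if (EXTERNAL_AVX2(cpu_flags)) {
        yadif->filter_line = ff_yadif_filter_line_avx2; /* hypothetical */
        yadif->req_align   = 32;                        /* 256-bit vectors */
    }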
Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
---
libavfilter/vf_yadif.c | 16 +++++++++-------
libavfilter/x86/vf_yadif_init.c | 1 +
libavfilter/yadif.h | 4 +++-
3 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index 055327d7a4..42f6246330 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -108,9 +108,9 @@ static void filter_line_c(void *dst1,
FILTER(0, w, 1)
}
-#define MAX_ALIGN 8
static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
- int w, int prefs, int mrefs, int parity, int mode)
+ int w, int prefs, int mrefs, int parity, int mode,
+ int alignment)
{
uint8_t *dst = dst1;
uint8_t *prev = prev1;
@@ -120,7 +120,7 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
uint8_t *prev2 = parity ? prev : cur ;
uint8_t *next2 = parity ? cur : next;
- const int edge = FFMAX(MAX_ALIGN - 1, 3);
+ const int edge = FFMAX(alignment - 1, 3);
int offset = FFMAX(w - edge, 3);
/* Only edge pixels need to be processed here. A constant value of false
@@ -159,7 +159,8 @@ static void filter_line_c_16bit(void *dst1,
}
static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
- int w, int prefs, int mrefs, int parity, int mode)
+ int w, int prefs, int mrefs, int parity, int mode,
+ int alignment)
{
uint16_t *dst = dst1;
uint16_t *prev = prev1;
@@ -169,7 +170,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
uint16_t *prev2 = parity ? prev : cur ;
uint16_t *next2 = parity ? cur : next;
- const int edge = FFMAX(MAX_ALIGN / 2 - 1, 3);
+ const int edge = FFMAX(alignment / 2 - 1, 3);
int offset = FFMAX(w - edge, 3);
mrefs /= 2;
@@ -199,7 +200,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
int slice_start = (td->h * jobnr ) / nb_jobs;
int slice_end = (td->h * (jobnr+1)) / nb_jobs;
int y;
- int edge = 3 + MAX_ALIGN / df - 1;
+ int edge = 3 + s->req_align / df - 1;
/* filtering reads 3 pixels to the left/right; to avoid invalid reads,
* we need to call the c variant which avoids this for border pixels
@@ -219,7 +220,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
s->filter_edges(dst, prev, cur, next, td->w,
y + 1 < td->h ? refs : -refs,
y ? -refs : refs,
- td->parity ^ td->tff, mode);
+ td->parity ^ td->tff, mode, s->req_align);
} else {
memcpy(&td->frame->data[td->plane][y * td->frame->linesize[td->plane]],
&s->cur->data[td->plane][y * refs], td->w * df);
@@ -303,6 +304,7 @@ static int config_output(AVFilterLink *outlink)
s->csp = av_pix_fmt_desc_get(outlink->format);
s->filter = filter;
+ s->req_align = 1;
if (s->csp->comp[0].depth > 8) {
s->filter_line = filter_line_c_16bit;
s->filter_edges = filter_edges_16bit;
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 257c3f9199..9dd73f8e44 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -53,6 +53,7 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
int bit_depth = (!yadif->csp) ? 8
: yadif->csp->comp[0].depth;
+ yadif->req_align = 8;
if (bit_depth >= 15) {
if (EXTERNAL_SSE2(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_16bit_sse2;
diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h
index c928911b35..b81f2fc1d9 100644
--- a/libavfilter/yadif.h
+++ b/libavfilter/yadif.h
@@ -66,11 +66,13 @@ typedef struct YADIFContext {
/**
* Required alignment for filter_line
*/
+ int req_align;
void (*filter_line)(void *dst,
void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int parity, int mode);
void (*filter_edges)(void *dst, void *prev, void *cur, void *next,
- int w, int prefs, int mrefs, int parity, int mode);
+ int w, int prefs, int mrefs, int parity, int mode,
+ int alignment);
const AVPixFmtDescriptor *csp;
int eof;
--
2.25.1
* [FFmpeg-devel] [PATCH 3/5] avfilter/vf_yadif: reformat code to improve readability
2022-07-20 4:41 [FFmpeg-devel] [PATCH 1/5] avfilter/vf_yadif: Fix edge size when MAX_ALIGN is < 4 Chris Phlipot
2022-07-20 4:41 ` [FFmpeg-devel] [PATCH 2/5] avfilter/vf_yadif: Allow alignment to be configurable Chris Phlipot
@ 2022-07-20 4:41 ` Chris Phlipot
2022-07-20 4:41 ` [FFmpeg-devel] [PATCH 4/5] avfilter/vf_yadif: Process more pixels using filter_line Chris Phlipot
2022-07-20 4:41 ` [FFmpeg-devel] [PATCH 5/5] avfilter/vf_yadif: Add x86_64 avx yadif asm Chris Phlipot
3 siblings, 0 replies; 7+ messages in thread
From: Chris Phlipot @ 2022-07-20 4:41 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Chris Phlipot
Reformat some of the code to improve readability and reduce code
duplication. This change is intended to be purely cosmetic and
shouldn't result in any functional changes.
Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
---
libavfilter/vf_yadif.c | 11 +++++------
libavfilter/yadif.h | 3 +--
2 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index 42f6246330..54109566be 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -211,16 +211,15 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
uint8_t *cur = &s->cur ->data[td->plane][y * refs];
uint8_t *next = &s->next->data[td->plane][y * refs];
uint8_t *dst = &td->frame->data[td->plane][y * td->frame->linesize[td->plane]];
+ int prefs = y + 1 < td->h ? refs : -refs;
+ int mrefs = y ? -refs : refs;
+ int parity = td->parity ^ td->tff;
int mode = y == 1 || y + 2 == td->h ? 2 : s->mode;
s->filter_line(dst + pix_3, prev + pix_3, cur + pix_3,
next + pix_3, td->w - edge,
- y + 1 < td->h ? refs : -refs,
- y ? -refs : refs,
- td->parity ^ td->tff, mode);
+ prefs, mrefs, parity, mode);
s->filter_edges(dst, prev, cur, next, td->w,
- y + 1 < td->h ? refs : -refs,
- y ? -refs : refs,
- td->parity ^ td->tff, mode, s->req_align);
+ prefs, mrefs, parity, mode, s->req_align);
} else {
memcpy(&td->frame->data[td->plane][y * td->frame->linesize[td->plane]],
&s->cur->data[td->plane][y * refs], td->w * df);
diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h
index b81f2fc1d9..f271fe8304 100644
--- a/libavfilter/yadif.h
+++ b/libavfilter/yadif.h
@@ -67,8 +67,7 @@ typedef struct YADIFContext {
* Required alignment for filter_line
*/
int req_align;
- void (*filter_line)(void *dst,
- void *prev, void *cur, void *next,
+ void (*filter_line)(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int parity, int mode);
void (*filter_edges)(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int parity, int mode,
--
2.25.1
* [FFmpeg-devel] [PATCH 4/5] avfilter/vf_yadif: Process more pixels using filter_line
2022-07-20 4:41 [FFmpeg-devel] [PATCH 1/5] avfilter/vf_yadif: Fix edge size when MAX_ALIGN is < 4 Chris Phlipot
2022-07-20 4:41 ` [FFmpeg-devel] [PATCH 2/5] avfilter/vf_yadif: Allow alignment to be configurable Chris Phlipot
2022-07-20 4:41 ` [FFmpeg-devel] [PATCH 3/5] avfilter/vf_yadif: reformat code to improve readability Chris Phlipot
@ 2022-07-20 4:41 ` Chris Phlipot
2022-07-20 4:41 ` [FFmpeg-devel] [PATCH 5/5] avfilter/vf_yadif: Add x86_64 avx yadif asm Chris Phlipot
3 siblings, 0 replies; 7+ messages in thread
From: Chris Phlipot @ 2022-07-20 4:41 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Chris Phlipot
filter_line is generally vectorized, whereas filter_edges is
implemented in C. Currently we rely on filter_edges to process
non-edge pixels in cases where the width doesn't match the alignment.
This causes us to process non-edge pixels with the slow C
implementation instead of the faster SSE implementation.
It is generally faster to process 8 pixels with the slowest SSE2
vectorized implementation than it is to process 2 pixels with the
C implementation. Therefore, if filter_edges needs to process 2 or
more non-edge pixels, it would be faster to process these non-edge
pixels with filter_line instead, even if it processes more pixels
than necessary.
To address this, we use filter_line as long as we know that at least
2 pixels will be used in the final output, even if the rest of the
computed pixels are invalid. Any incorrect output pixels generated by
filter_line will be overwritten by the following call to filter_edges.
In addition, we avoid running filter_line if it would read or write
pixels outside the current slice.
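A hedged worked example (illustrative numbers only; df == 1 and
req_align == 8 assumed, and the slice-boundary guard ignored):

    /* w = 100:
     * filter_width_target     = 100 - 3 = 97
     * filter_width_rounded_up = (97 & ~7) + 8 = 104
     * 104 - 97 = 7 >= 2, so filter_line runs with width 104 (covering
     * x = 3..106, where x >= 97 may be garbage) and filter_edges is
     * called with an alignment of 1, redoing only the 3 true edge
     * pixels x = 97..99 instead of filtering x = 93..99 in C. */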
Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
---
libavfilter/vf_yadif.c | 23 +++++++++++++++++++++--
1 file changed, 21 insertions(+), 2 deletions(-)
diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index 54109566be..394c04a985 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -201,6 +201,8 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
int slice_end = (td->h * (jobnr+1)) / nb_jobs;
int y;
int edge = 3 + s->req_align / df - 1;
+ int filter_width_target = td->w - 3;
+ int filter_width_rounded_up = (filter_width_target & ~(s->req_align-1)) + s->req_align;
/* filtering reads 3 pixels to the left/right; to avoid invalid reads,
* we need to call the c variant which avoids this for border pixels
@@ -215,11 +217,28 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
int mrefs = y ? -refs : refs;
int parity = td->parity ^ td->tff;
int mode = y == 1 || y + 2 == td->h ? 2 : s->mode;
+
+ /* Adjust width and alignment to process extra pixels in filter_line
+ * using potentially vectorized code so long as it doesn't cause
+ * reads or writes outside of the current slice. filter_edge will
+ * correct any incorrect pixels written by filter_line in this
+ * scenario.
+ */
+ int filter_width;
+ int edge_alignment;
+ if (filter_width_rounded_up - filter_width_target >= 2
+ && y*refs + filter_width_rounded_up < slice_end * refs + refs - 3) {
+ filter_width = filter_width_rounded_up;
+ edge_alignment = 1;
+ } else {
+ filter_width = td->w - edge;
+ edge_alignment = s->req_align;
+ }
s->filter_line(dst + pix_3, prev + pix_3, cur + pix_3,
- next + pix_3, td->w - edge,
+ next + pix_3, filter_width,
prefs, mrefs, parity, mode);
s->filter_edges(dst, prev, cur, next, td->w,
- prefs, mrefs, parity, mode, s->req_align);
+ prefs, mrefs, parity, mode, edge_alignment);
} else {
memcpy(&td->frame->data[td->plane][y * td->frame->linesize[td->plane]],
&s->cur->data[td->plane][y * refs], td->w * df);
--
2.25.1
* [FFmpeg-devel] [PATCH 5/5] avfilter/vf_yadif: Add x86_64 avx yadif asm
2022-07-20 4:41 [FFmpeg-devel] [PATCH 1/5] avfilter/vf_yadif: Fix edge size when MAX_ALIGN is < 4 Chris Phlipot
` (2 preceding siblings ...)
2022-07-20 4:41 ` [FFmpeg-devel] [PATCH 4/5] avfilter/vf_yadif: Process more pixels using filter_line Chris Phlipot
@ 2022-07-20 4:41 ` Chris Phlipot
2022-07-20 13:16 ` Michael Niedermayer
3 siblings, 1 reply; 7+ messages in thread
From: Chris Phlipot @ 2022-07-20 4:41 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Chris Phlipot
Add a new version of yadif_filter_line that operates on packed bytes
instead of the packed words used by the current implementation. As
a result this implementation runs almost 2x as fast as the current
fastest SSSE3 implementation.
This implementation is created from scratch based on the C code, with
the goal of keeping all intermediate values within 8 bits so that
the vectorized code can be computed using packed bytes. The differences
are as follows:
- Use algorithms to compute avg and abs difference using only 8-bit
intermediate values.
- Reworked the mode 1 code by applying various mathematical identities
to keep all intermediate values within 8 bits.
- Attempt to compute the spatial score using only 8 bits. The actual
spatial score fits within this range about 97% of the time (content
dependent) for the entire 128-bit xmm vector. In case the spatial
score needs more than 8 bits to be represented, we detect this and
recompute the spatial score using 16-bit packed words instead.
In the roughly 3% of cases where the spatial score needs more than
8 bits to store, we take a slow path where the spatial score is
computed using packed words instead.
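The 8-bit building blocks from the first bullet can be sketched in C
as follows (a hedged reference model mirroring the asm macros in the
patch, not the asm itself):

    #include <stdint.h>

    /* Average rounded down: pavgb computes (a + b + 1) >> 1 (rounded
     * up), so subtract the carry bit (a ^ b) & 1 to round down. */
    static uint8_t avg_floor_u8(uint8_t a, uint8_t b)
    {
        uint8_t avg_up = (uint8_t)((a + b + 1) >> 1);  /* pavgb */
        return (uint8_t)(avg_up - ((a ^ b) & 1));      /* pxor/pand/psubb */
    }

    /* Absolute difference: two saturating subtractions OR'd together. */
    static uint8_t absdiff_u8(uint8_t a, uint8_t b)
    {
        uint8_t d1 = a > b ? (uint8_t)(a - b) : 0;     /* psubusb a, b */
        uint8_t d2 = b > a ? (uint8_t)(b - a) : 0;     /* psubusb b, a */
        return (uint8_t)(d1 | d2);                     /* por */
    }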
This implementation is currently limited to x86_64 due to the number
of registers required. x86_32 is possible, but the performance benefit
over the existing SSSE3 implementation is not as great, due to all of
the stack spills that would result from having far fewer registers.
Asm was not written for the 32-bit variant due to the limited ROI, as
most AVX users are likely on a 64-bit OS at this point and 32-bit
users would lose out on most of the performance benefit.
Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
---
libavfilter/x86/Makefile | 2 +-
libavfilter/x86/vf_yadif_init.c | 9 +
libavfilter/x86/vf_yadif_x64.asm | 489 +++++++++++++++++++++++++++++++
3 files changed, 499 insertions(+), 1 deletion(-)
create mode 100644 libavfilter/x86/vf_yadif_x64.asm
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index e87481bd7a..19161ffa23 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -80,4 +80,4 @@ X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose.o
X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
X86ASM-OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360.o
X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o
-X86ASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
+X86ASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/vf_yadif_x64.o x86/yadif-16.o x86/yadif-10.o
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 9dd73f8e44..a46bd7ccca 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -29,6 +29,9 @@ void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,
void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
int mrefs, int parity, int mode);
+void ff_yadif_filter_line_avx(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
@@ -71,5 +74,11 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
yadif->filter_line = ff_yadif_filter_line_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_ssse3;
+#if ARCH_X86_64
+ if (EXTERNAL_AVX(cpu_flags)) {
+ yadif->filter_line = ff_yadif_filter_line_avx;
+ yadif->req_align = 16;
+ }
+#endif
}
}
diff --git a/libavfilter/x86/vf_yadif_x64.asm b/libavfilter/x86/vf_yadif_x64.asm
new file mode 100644
index 0000000000..3f70aa0fd2
--- /dev/null
+++ b/libavfilter/x86/vf_yadif_x64.asm
@@ -0,0 +1,489 @@
+;******************************************************************************
+;* Copyright (C) 2006-2011 Michael Niedermayer <michaelni@gmx.at>
+;* 2010 James Darnley <james.darnley@gmail.com>
+;* 2013-2022 Chris Phlipot <cphlipot0@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pb_127: times 16 db 127
+pb_128: times 16 db 128
+
+SECTION .text
+
+; Rename a register so that it can be used for a new purpose. The old name
+; will become undefined so that any additional usage of the old name will
+; result in a compiler/assembler error.
+%macro RENAME_REGISTER 2
+ %ifidni %1,%2
+ %error "Can't rename a register to itself."
+ %endif
+ %xdefine %1 %2
+ %undef %2
+%endmacro
+
+; Usage: dst, arg1, arg2, temp1
+; Compute the absolute difference of arg1 and arg2 and place it in dst.
+; All operations are performed using packed bytes. Unlike ARM NEON, there is
+; no single instruction to do this, so instead we emulate it with multiple
+; instructions.
+; eg. dst = abs(arg1 - arg2)
+%macro absdif_pb 4
+ %ifidni %1,%3
+ %error "arg1 and arg3 must be different"
+ %elifidni %1,%4
+ %error "arg1 and arg4 must be different"
+ %elifidni %3,%4
+ %error "arg3 and arg4 must be different"
+ %endif
+ psubusb %4, %3, %2
+ psubusb %1, %2, %3
+ por %1, %1, %4
+%endmacro
+
+; Usage: dst, arg1, arg2, pb_1, temp1
+; Compute the average of 2 unsigned values rounded down.
+; SSE provides pavgb, which rounds up. Unlike ARM NEON, SSE doesn't provide
+; an instruction that computes the avg of 2 unsigned bytes rounded down, so
+; instead we emulate it with this macro.
+; eg. dst = (arg1 + arg2) >> 1
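+; (Hedged note, not from the original patch: the identity used is
+; (a + b) >> 1 == pavgb(a, b) - ((a ^ b) & 1), since pavgb computes
+; (a + b + 1) >> 1, i.e. the average rounded up.)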
+%macro avg_truncate_pb 5
+ %ifidni %1,%3
+ %error "arg1 and arg3 must be different"
+ %elifidni %1,%4
+ %error "arg1 and arg5 must be different"
+ %endif
+ pxor %5, %2, %3
+ pavgb %1, %2, %3
+ pand %5, %5, %4
+ psubb %1, %1, %5
+%endmacro
+
+INIT_XMM avx
+
+cglobal yadif_filter_line, 5, 15, 8, 240, dst, prev, cur, next, width, prefs, \
+ mrefs, parity, mode
+%xdefine cur_plus_prefs r5
+%xdefine cur_plus_mrefs r6
+%xdefine prefs r7
+%xdefine next2 r8
+%xdefine prev2_2mrefs r9
+%xdefine mrefs r10
+%xdefine prev2_2prefs r11
+%xdefine next2_2mrefs r12
+%xdefine prev_plus_mrefs r13
+%xdefine next_plus_mrefs r14
+%xdefine prev2_2mrefs_stack_spill [rsp - 24]
+%xdefine pb_1_reg m15
+
+%xdefine old_absdif_ahead_stack [rsp - 128]
+%xdefine absdif_here [rsp - 80]
+%xdefine absdif_behind [rsp - 64]
+
+%xdefine spatial_predicate_stack [rsp - 112]
+%xdefine spatial_pred_check_minus_1 [rsp - 16]
+
+; Unaligned loads are slower than aligned loads. It is often beneficial to
+; store a value in an aligned location after doing an unaligned load so that
+; all future loads of that value will be aligned.
+%xdefine cur_plus_prefs_x_stack [rsp]
+%xdefine cur_plus_mrefs_x_stack [rsp + 16]
+%xdefine cur_plus_mrefs_x_2_stack [rsp + 96]
+%xdefine cur_plus_prefs_x_minus_2 [rsp + 80]
+
+; Absolute differences used for CHECK(-1)
+%xdefine chkneg1_ad2_stack [rsp - 96]
+%xdefine chkneg1_ad1_stack [rsp + 176]
+%xdefine chkneg1_ad0_stack [rsp - 48]
+
+; Absolute differences used for CHECK(-2)
+%xdefine chkneg2_ad2_stack [rsp + 160]
+%xdefine chkneg2_ad1_stack [rsp + 144]
+%xdefine chkneg2_ad0_stack [rsp + 208]
+
+; Absolute differences used for CHECK(1)
+%xdefine chkpos1_ad2_stack [rsp + 112]
+%xdefine chkpos1_ad1_stack [rsp + 128]
+; chkpos1_ad0 has no stack location since it is kept in a register.
+
+; Absolute differences used for CHECK(2)
+%xdefine chkpos2_ad2_stack [rsp + 64]
+%xdefine chkpos2_ad1_stack [rsp + 48]
+%xdefine chkpos2_ad0_stack [rsp + 32]
+
+ movsxd prefs, DWORD prefsm
+ movsxd mrefs, DWORD mrefsm
+; Bail out early if width is zero or negative.
+ test widthd, widthd
+ jle .return
+
+; Initialize all pointers. Unlike the C code the pointers all point to the
+; location where x equals 0 and remain unchanged instead of the pointers being
+; incremented on every loop iteration. Instead only x is incremented, and x86
+; memory addressing is used to add the current value of x on every memory
+; access at (most likely) zero cost.
+ lea cur_plus_prefs, [curq + prefs]
+ movu m0, [curq + prefs - 1]
+ lea cur_plus_mrefs, [curq + mrefs]
+ movu m1, [curq + mrefs - 1]
+ absdif_pb m0, m0, m1, m5
+ cmp dword paritym, 0
+ mov next2, curq
+RENAME_REGISTER prev2, curq
+ cmove next2, nextq
+ pslldq m8, m0, 14
+ mova old_absdif_ahead_stack, m8
+ cmovne prev2, prevq
+ lea prev_plus_mrefs, [prevq + mrefs]
+ add prevq, prefs
+RENAME_REGISTER prev_plus_prefs, prevq
+ lea next_plus_mrefs, [nextq + mrefs]
+ add nextq, prefs
+RENAME_REGISTER next_plus_prefs, nextq
+ lea prev2_2mrefs, [prev2 + 2*mrefs]
+ mov prev2_2mrefs_stack_spill, prev2_2mrefs
+ lea prev2_2prefs, [prev2 + 2*prefs]
+ lea next2_2mrefs, [next2 + 2*mrefs]
+RENAME_REGISTER next2_2prefs, mrefs
+ lea next2_2prefs, [next2 + 2*prefs]
+RENAME_REGISTER x, prefs
+ xor x, x
+ mova pb_1_reg, [pb_1]
+ mov prev2_2mrefs, prev2_2mrefs_stack_spill
+
+ jmp .loop_start
+.loop_tail:
+ paddusb m3, m2, m1
+ pminub m0, m9, m3
+ psubusb m2, m2, m1
+ pmaxub m0, m0, m2
+ movu [dstq + x], m0
+ add x, 16
+ cmp x, widthq
+ jge .return
+.loop_start:
+; Start by computing the spatial score
+; We attempt to compute the spatial score using saturated adds. In
+; real-world content the entire 16-byte spatial score xmm vector can be
+; accurately represented in 8 bits > 97% of the time. Because of this we
+; try computing the spatial score with 8 bits first, since it is 2x as
+; fast, and check later whether the computation saturated.
+; The original spatial score can potentially be in the range of -1 to 765.
+; For this approach, we instead map the lower end of that to 8 bits using
+; the range -128 to 127.
+; If we detect that this assumption may have failed, we re-compute
+; the spatial score using the full 16-bit range needed to represent -1 to 765.
+;
+; Before we compute the spatial score, we pre-compute most of the absolute
+; difference values used in the C code's CHECK() macros. These absolute
+; differences are then stored to the stack so that they can be re-used for the
+; slower 16-bit spatial score approach in case that is needed.
+ movu m6, [cur_plus_mrefs + x - 3]
+ movu m11, [cur_plus_mrefs + x - 2]
+ movu m2, [cur_plus_mrefs + x - 1]
+ movu m3, [cur_plus_mrefs + x]
+ movu m13, [cur_plus_prefs + x]
+ movu m0, [cur_plus_mrefs + x + 1]
+ movu m1, [cur_plus_prefs + x + 1]
+ absdif_pb m14, m0, m1, m5 ; abs(cur[mrefs+1]-cur[prefs+1])
+ avg_truncate_pb m10, m13, m3, pb_1_reg, m5 ; spatial_pred = (c+d) >> 1
+ mova spatial_predicate_stack, m10
+ movu m7, [cur_plus_prefs + x + 2]
+ absdif_pb m10, m11, m13, m5 ; abs(cur[mrefs-2]-cur[prefs])
+ mova chkneg1_ad2_stack, m10
+ absdif_pb m8, m2, m1, m5 ; abs(cur[mrefs-1]-cur[prefs+1])
+ absdif_pb m9, m3, m7, m5 ; abs(cur[mrefs]-cur[prefs+2])
+ absdif_pb m10, m6, m1, m5 ; abs(cur[mrefs-3]-cur[prefs+1])
+ mova chkneg2_ad2_stack, m10
+ absdif_pb m10, m11, m7, m5 ; abs(cur[mrefs-2]-cur[prefs+2])
+ mova chkneg2_ad1_stack, m10
+ movu m4, [cur_plus_prefs + x + 3]
+ absdif_pb m10, m2, m4, m5 ; abs(cur[mrefs-1]-cur[prefs+3])
+ mova chkneg2_ad0_stack, m10
+ movu m12, [cur_plus_mrefs + x + 2]
+ absdif_pb m10, m12, m13, m5 ; abs(cur[mrefs+2]-cur[prefs])
+ mova cur_plus_prefs_x_stack, m13
+ mova chkpos1_ad2_stack, m10
+ movu m6, [cur_plus_prefs + x - 1]
+ absdif_pb m10, m0, m6, m5 ; abs(cur[mrefs+1]-cur[prefs-1])
+ mova chkpos1_ad1_stack, m10
+ movu m10, [cur_plus_prefs + x - 2]
+ mova cur_plus_mrefs_x_stack, m3
+ absdif_pb m13, m10, m3, m5 ; abs(cur[mrefs]-cur[prefs-2])
+ movu m4, [cur_plus_mrefs + x + 3]
+ absdif_pb m4, m4, m6, m5 ; abs(cur[mrefs+3]-cur[prefs-1])
+ mova chkpos2_ad2_stack, m4
+ absdif_pb m3, m12, m10, m5 ; abs(cur[mrefs+2]-cur[prefs-2])
+ mova chkpos2_ad1_stack, m3
+ movu m4, [cur_plus_prefs + x - 3]
+ absdif_pb m3, m0, m4, m5 ; abs(cur[mrefs+1]-cur[prefs-3])
+ mova chkpos2_ad0_stack, m3
+ mova chkneg1_ad1_stack, m8
+ paddusb m5, m8, chkneg1_ad2_stack
+ mova chkneg1_ad0_stack, m9
+ paddusb m4, m9, pb_1_reg
+ paddusb m5, m5, m4
+ mova m3, old_absdif_ahead_stack
+ palignr m4, m14, m3, 15
+ palignr m3, m14, m3, 14
+ mova old_absdif_ahead_stack, m14
+ mova absdif_here, m4
+ paddusb m4, m14, m4
+ mova absdif_behind, m3
+ paddusb m4, m4, m3
+ pxor m4, m4, [pb_128]
+ pxor m5, m5, [pb_128]
+ pcmpgtb m8, m4, m5
+ pcmpeqb m14, m4, [pb_127]
+ por m8, m8, m14
+ pminsb m4, m4, m5
+ avg_truncate_pb m1, m1, m2, pb_1_reg, m5
+ mova spatial_pred_check_minus_1, m1
+ mova m1, chkneg2_ad1_stack
+ paddusb m2, m1, chkneg2_ad2_stack
+ paddusb m5, pb_1_reg, chkneg2_ad0_stack
+ paddusb m2, m2, m5
+ avg_truncate_pb m11, m11, m7, pb_1_reg, m5
+ mova m3, chkpos1_ad1_stack
+ paddusb m3, m3, chkpos1_ad2_stack
+ paddusb m5, m13, pb_1_reg
+ paddusb m7, m3, m5
+ avg_truncate_pb m3, m6, m0, pb_1_reg, m5
+ pxor m0, m2, [pb_128]
+ pcmpgtb m2, m4, m0
+ pand m5, m8, m2
+ pblendvb m6, m4, m0, m5
+ pcmpeqb m0, m4, [pb_127]
+ pand m2, m5, m0
+ pxor m7, m7, [pb_128]
+ pminsb m0, m6, m7
+ pcmpeqb m4, m0, [pb_127]
+ por m2, m4, m2
+ ptest m2, m2
+ jne .spatial_check_16_bit
+; At this point we know if we can continue on the fast path with saturating
+; spatial score computation while maintaining bit-accuracy, or if we need to
+; bail out and perform the spatial score computation using full 16-bit words
+; to store the score value. check_2_saturate is only executed here if we know
+; we don't need to go down the slow path.
+.check_2_saturate:
+ mova m2, spatial_predicate_stack
+ pblendvb m1, m2, spatial_pred_check_minus_1, m8
+ pblendvb m1, m11, m5
+ pcmpgtb m2, m6, m7
+ pcmpeqb m5, m6, [pb_127]
+ por m2, m5, m2
+ pblendvb m1, m3, m2
+ mova m3, chkpos2_ad1_stack
+ paddusb m3, m3, chkpos2_ad2_stack
+ paddusb m4, pb_1_reg, chkpos2_ad0_stack
+ paddusb m3, m3, m4
+ pxor m3, m3, [pb_128]
+ pcmpgtb m0, m0, m3
+ pand m0, m0, m2
+ avg_truncate_pb m2, m12, m10, pb_1_reg, m5
+ pblendvb m9, m1, m2, m0
+.temporal_check:
+ mova m0, cur_plus_mrefs_x_stack
+ mova m8, cur_plus_prefs_x_stack
+ movu m1, [prev2 + x]
+ movu m6, [next2 + x]
+ avg_truncate_pb m2, m6, m1, pb_1_reg, m5
+ absdif_pb m1, m1, m6, m5
+ movu m6, [prev_plus_mrefs + x]
+ movu m4, [prev_plus_prefs + x]
+ absdif_pb m6, m6, m0, m5
+ absdif_pb m4, m4, m8, m5
+ avg_truncate_pb m6, m6, m4, pb_1_reg, m5
+ movu m4, [next_plus_mrefs + x]
+ movu m3, [next_plus_prefs + x]
+ absdif_pb m4, m4, m0, m5
+ absdif_pb m3, m3, m8, m5
+ avg_truncate_pb m4, m4, m3, pb_1_reg, m5
+ pmaxub m6, m6, m4
+ psrlw m1, m1, 1
+ pand m1, m1, [pb_127]
+ pmaxub m1, m1, m6
+ cmp DWORD modem, 1
+ jg .loop_tail
+.handle_mode_1:
+; Handle the "if (!(mode&2))" section.
+; This section has undergone some complex
+; transformations with respect to the C implementation in order to
+; ensure that all inputs, outputs and intermediate values can be
+; stored in 8-bit unsigned values. The code is transformed with
+; various identities to prevent signed intermediate values, which
+; would require an extra 9th bit for the sign, which we don't have.
+; Two main identities are applied:
+; 1. -MAX(a-b, c-d) = MIN(b-a, d-c)
+; 2. MIN(a-c, b-c) = MIN(a, b)-c
+; The following from the C code:
+;
+; int max = FFMAX3(d-e, d-c, FFMIN(b-c, f-e));
+; diff = FFMAX3(diff, min, -max);
+;
+; becomes:
+; int negative_max = FFMIN( FFMIN(e, c)-d, FFMAX(c-b, e-f))
+; diff = FFMAX3(diff, min, negative_max);
+;
+; Lastly we know that diff must be non-negative in the end, so
+; intermediate negative values don't matter. to keep computations
+; within 8 bits, we use saturating subtraction which replaces all
+; negative intermediate results with 0, but doesn't affect the
+; final value assigned to diff.
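+; (Hedged numeric check of identity 1, illustrative values only:
+; a=5, b=2, c=7, d=3 gives -MAX(5-2, 7-3) = -4 = MIN(2-5, 3-7).)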
+ movu m6, [prev2_2mrefs + x]
+ movu m4, [next2_2mrefs + x]
+ avg_truncate_pb m6, m6, m4, pb_1_reg, m5
+ movu m4, [prev2_2prefs + x]
+ movu m3, [next2_2prefs + x]
+ avg_truncate_pb m4, m4, m3, pb_1_reg, m5
+ psubusb m3, m8, m2
+ psubusb m5, m0, m2
+ pminub m3, m3, m5
+ psubusb m5, m0, m6
+ psubusb m7, m8, m4
+ pmaxub m5, m5, m7
+ pminub m3, m3, m5
+ psubusb m5, m2, m8
+ psubusb m7, m2, m0
+ pminub m5, m5, m7
+ psubusb m6, m6, m0
+ psubusb m4, m4, m8
+ pmaxub m6, m6, m4
+ pminub m6, m5, m6
+ pmaxub m6, m6, m3
+ pmaxub m1, m1, m6
+ jmp .loop_tail
+.spatial_check_16_bit:
+; When the 8-bit fast path fails, we compute the spatial score using packed
+; words to store the temporary values. Every input register containing
+; packed bytes is unpacked into 2 separate registers with packed words,
+; which are then processed identically. This path should generally run
+; < 3% of the time, and is kept mainly to ensure the output is bit-accurate
+; compared to the C implementation.
+ mova cur_plus_mrefs_x_2_stack, m12
+ mova cur_plus_prefs_x_minus_2, m10
+ mova m5, old_absdif_ahead_stack
+ pmovzxbw m0, m5
+ mova m4, absdif_here
+ pmovzxbw m2, m4
+ paddw m0, m0, m2
+ pxor m12, m12, m12
+ punpckhbw m2, m4, m12
+ punpckhbw m5, m5, m12
+ paddw m2, m5, m2
+ mova m7, absdif_behind
+ pmovzxbw m5, m7
+ pcmpeqd m4, m4, m4
+ paddw m5, m5, m4
+ paddw m9, m0, m5
+ punpckhbw m5, m7, m12
+ paddw m5, m5, m4
+ paddw m7, m2, m5
+ mova m0, chkneg1_ad2_stack
+ pmovzxbw m2, m0
+ mova m4, chkneg1_ad1_stack
+ pmovzxbw m5, m4
+ paddw m2, m2, m5
+ punpckhbw m5, m0, m12
+ punpckhbw m4, m4, m12
+ paddw m4, m5, m4
+ mova m0, chkneg1_ad0_stack
+ pmovzxbw m5, m0
+ paddw m5, m2, m5
+ pminsw m6, m9, m5
+ punpckhbw m2, m0, m12
+ paddw m2, m4, m2
+ pminsw m14, m2, m7
+ pcmpgtw m4, m9, m5
+ pcmpgtw m10, m7, m2
+ packsswb m0, m4, m10
+ mova m2, spatial_predicate_stack
+ pblendvb m0, m2, spatial_pred_check_minus_1, m0
+ mova spatial_predicate_stack, m0
+ mova m0, chkneg2_ad2_stack
+ pmovzxbw m2, m0
+ mova m1, chkneg2_ad0_stack
+ pmovzxbw m7, m1
+ paddw m2, m7, m2
+ mova m9, chkneg2_ad1_stack
+ pmovzxbw m7, m9
+ paddw m2, m2, m7
+ punpckhbw m7, m0, m12
+ punpckhbw m5, m1, m12
+ paddw m5, m5, m7
+ punpckhbw m7, m9, m12
+ paddw m5, m5, m7
+ mova m0, chkpos1_ad2_stack
+ pmovzxbw m7, m0
+ mova m1, chkpos1_ad1_stack
+ pmovzxbw m8, m1
+ paddw m8, m8, m7
+ punpckhbw m7, m0, m12
+ punpckhbw m0, m1, m12
+ paddw m7, m0, m7
+ pmovzxbw m1, m13
+ paddw m9, m8, m1
+ punpckhbw m0, m13, m12
+ paddw m8, m7, m0
+ pcmpgtw m0, m6, m2
+ pand m0, m0, m4
+ pcmpgtw m4, m14, m5
+ pand m4, m10, m4
+ pblendvb m1, m6, m2, m0
+ pblendvb m14, m5, m4
+ packsswb m0, m0, m4
+ mova m5, spatial_predicate_stack
+ pblendvb m0, m5, m11, m0
+ pcmpgtw m5, m1, m9
+ pcmpgtw m4, m14, m8
+ packsswb m6, m5, m4
+ pblendvb m13, m0, m3, m6
+ mova m0, chkpos2_ad2_stack
+ pmovzxbw m3, m0
+ mova m7, chkpos2_ad1_stack
+ pmovzxbw m6, m7
+ paddw m3, m6, m3
+ punpckhbw m6, m0, m12
+ punpckhbw m0, m7, m12
+ paddw m0, m0, m6
+ mova m7, chkpos2_ad0_stack
+ pmovzxbw m6, m7
+ paddw m3, m3, m6
+ punpckhbw m6, m7, m12
+ paddw m0, m0, m6
+ pminsw m1, m9, m1
+ pcmpgtw m1, m1, m3
+ pminsw m14, m8, m14
+ pcmpgtw m14, m14, m0
+ pand m1, m1, m5
+ pand m14, m14, m4
+ packsswb m14, m1, m14
+ mova m0, cur_plus_mrefs_x_2_stack
+ mova m5, cur_plus_prefs_x_minus_2
+ pxor m1, m5, m0
+ pavgb m0, m0, m5
+ pand m1, m1, [pb_1]
+ psubb m1, m0, m1
+ pblendvb m9, m13, m1, m14
+ jmp .temporal_check
+.return:
+ RET
--
2.25.1
* Re: [FFmpeg-devel] [PATCH 5/5] avfilter/vf_yadif: Add x86_64 avx yadif asm
2022-07-20 4:41 ` [FFmpeg-devel] [PATCH 5/5] avfilter/vf_yadif: Add x86_64 avx yadif asm Chris Phlipot
@ 2022-07-20 13:16 ` Michael Niedermayer
2022-07-21 2:30 ` Chris Phlipot
0 siblings, 1 reply; 7+ messages in thread
From: Michael Niedermayer @ 2022-07-20 13:16 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Tue, Jul 19, 2022 at 09:41:17PM -0700, Chris Phlipot wrote:
> Add a new version of yadif_filter_line that operates on packed bytes
> instead of the packed words used by the current implementation. As
> a result this implementation runs almost 2x as fast as the current
> fastest SSSE3 implementation.
>
> This implementation is created from scratch based on the C code, with
> the goal of keeping all intermediate values within 8 bits so that
> the vectorized code can be computed using packed bytes. The differences
> are as follows:
> - Use algorithms to compute avg and abs difference using only 8-bit
> intermediate values.
> - Reworked the mode 1 code by applying various mathematical identities
> to keep all intermediate values within 8 bits.
> - Attempt to compute the spatial score using only 8 bits. The actual
> spatial score fits within this range about 97% of the time (content
> dependent) for the entire 128-bit xmm vector. In case the spatial
> score needs more than 8 bits to be represented, we detect this and
> recompute the spatial score using 16-bit packed words instead.
>
> In the roughly 3% of cases where the spatial score needs more than
> 8 bits to store, we take a slow path where the spatial score is
> computed using packed words instead.
>
> This implementation is currently limited to x86_64 due to the number
> of registers required. x86_32 is possible, but the performance benefit
> over the existing SSSE3 implementation is not as great, due to all of
> the stack spills that would result from having far fewer registers.
> Asm was not written for the 32-bit variant due to the limited ROI, as
> most AVX users are likely on a 64-bit OS at this point and 32-bit
> users would lose out on most of the performance benefit.
>
> Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
There's no need to support 32-bit, but the ffmpeg build must not break
on linux x86-32
src/libavfilter/x86/vf_yadif_x64.asm:145: error: impossible combination of address sizes
src/libavfilter/x86/vf_yadif_x64.asm:145: error: invalid effective address
src/libavfilter/x86/vf_yadif_x64.asm:146: error: impossible combination of address sizes
src//libavutil/x86/x86inc.asm:1399: ... from macro `movdqu' defined here
src//libavutil/x86/x86inc.asm:1264: ... from macro `RUN_AVX_INSTR' defined here
src//libavutil/x86/x86inc.asm:1717: ... from macro `vmovdqu' defined here
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Everything should be made as simple as possible, but not simpler.
-- Albert Einstein
* Re: [FFmpeg-devel] [PATCH 5/5] avfilter/vf_yadif: Add x86_64 avx yadif asm
2022-07-20 13:16 ` Michael Niedermayer
@ 2022-07-21 2:30 ` Chris Phlipot
0 siblings, 0 replies; 7+ messages in thread
From: Chris Phlipot @ 2022-07-21 2:30 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Thanks for calling that out. It looks like I was cross-compiling for 32-bit
incorrectly from my 64-bit host. I've reproduced the failure and submitted
a v2 with the fix. If you're still seeing build failures even after v2, can
you also provide more details on how you are building so I can reproduce
and fix?
- Chris
On Wed, Jul 20, 2022 at 6:17 AM Michael Niedermayer <michael@niedermayer.cc>
wrote:
> On Tue, Jul 19, 2022 at 09:41:17PM -0700, Chris Phlipot wrote:
> > Add a new version of yadif_filter_line that operates on packed bytes
> > instead of the packed words used by the current implementation. As
> > a result this implementation runs almost 2x as fast as the current
> > fastest SSSE3 implementation.
> >
> > This implementation is created from scratch based on the C code, with
> > the goal of keeping all intermediate values within 8 bits so that
> > the vectorized code can be computed using packed bytes. The differences
> > are as follows:
> > - Use algorithms to compute avg and abs difference using only 8-bit
> > intermediate values.
> > - Reworked the mode 1 code by applying various mathematical identities
> > to keep all intermediate values within 8 bits.
> > - Attempt to compute the spatial score using only 8 bits. The actual
> > spatial score fits within this range about 97% of the time (content
> > dependent) for the entire 128-bit xmm vector. In case the spatial
> > score needs more than 8 bits to be represented, we detect this and
> > recompute the spatial score using 16-bit packed words instead.
> >
> > In the roughly 3% of cases where the spatial score needs more than
> > 8 bits to store, we take a slow path where the spatial score is
> > computed using packed words instead.
> >
> > This implementation is currently limited to x86_64 due to the number
> > of registers required. x86_32 is possible, but the performance benefit
> > over the existing SSSE3 implementation is not as great, due to all of
> > the stack spills that would result from having far fewer registers.
> > Asm was not written for the 32-bit variant due to the limited ROI, as
> > most AVX users are likely on a 64-bit OS at this point and 32-bit
> > users would lose out on most of the performance benefit.
> >
> > Signed-off-by: Chris Phlipot <cphlipot0@gmail.com>
>
> There's no need to support 32-bit, but the ffmpeg build must not break
> on linux x86-32
>
> src/libavfilter/x86/vf_yadif_x64.asm:145: error: impossible combination of
> address sizes
> src/libavfilter/x86/vf_yadif_x64.asm:145: error: invalid effective address
> src/libavfilter/x86/vf_yadif_x64.asm:146: error: impossible combination of
> address sizes
> src//libavutil/x86/x86inc.asm:1399: ... from macro `movdqu' defined here
> src//libavutil/x86/x86inc.asm:1264: ... from macro `RUN_AVX_INSTR' defined
> here
> src//libavutil/x86/x86inc.asm:1717: ... from macro `vmovdqu' defined here
>
>
> [...]