* [FFmpeg-devel] [PATCH v3 1/7] tests/checkasm: Add test for vf_bwdif filter_intra
2023-07-03 19:04 [FFmpeg-devel] [PATCH v3 0/7] avfilter/vf_bwdif: Add aarch64 neon functions John Cox
@ 2023-07-03 19:04 ` John Cox
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 2/7] avfilter/vf_bwdif: Add neon for filter_intra John Cox
` (5 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: John Cox @ 2023-07-03 19:04 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: thomas.mundt, John Cox, martin
Signed-off-by: John Cox <jc@kynesim.co.uk>
---
tests/checkasm/vf_bwdif.c | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
index 46224bb575..034bbabb4c 100644
--- a/tests/checkasm/vf_bwdif.c
+++ b/tests/checkasm/vf_bwdif.c
@@ -20,6 +20,7 @@
#include "checkasm.h"
#include "libavcodec/internal.h"
#include "libavfilter/bwdif.h"
+#include "libavutil/mem_internal.h"
#define WIDTH 256
@@ -81,4 +82,40 @@ void checkasm_check_vf_bwdif(void)
BODY(uint16_t, 10);
report("bwdif10");
}
+
+ if (check_func(ctx_8.filter_intra, "bwdif8.intra")) {
+ LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]);
+ LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]);
+ const int stride = WIDTH;
+ const int mask = (1<<8)-1;
+
+ declare_func(void, void *dst1, void *cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max);
+
+ randomize_buffers( cur0, cur1, mask, 11*WIDTH);
+ memset(dst0, 0xba, WIDTH * 3);
+ memset(dst1, 0xba, WIDTH * 3);
+
+ call_ref(dst0 + stride,
+ cur0 + stride * 4, WIDTH,
+ stride, -stride, stride * 3, -stride * 3,
+ 0, mask);
+ call_new(dst1 + stride,
+ cur0 + stride * 4, WIDTH,
+ stride, -stride, stride * 3, -stride * 3,
+ 0, mask);
+
+ if (memcmp(dst0, dst1, WIDTH*3)
+ || memcmp( cur0, cur1, WIDTH*11))
+ fail();
+
+ bench_new(dst1 + stride,
+ cur0 + stride * 4, WIDTH,
+ stride, -stride, stride * 3, -stride * 3,
+ 0, mask);
+
+ report("bwdif8.intra");
+ }
}
--
2.39.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* [FFmpeg-devel] [PATCH v3 2/7] avfilter/vf_bwdif: Add neon for filter_intra
2023-07-03 19:04 [FFmpeg-devel] [PATCH v3 0/7] avfilter/vf_bwdif: Add aarch64 neon functions John Cox
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 1/7] tests/checkasm: Add test for vf_bwdif filter_intra John Cox
@ 2023-07-03 19:04 ` John Cox
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 3/7] tests/checkasm: Add test for vf_bwdif filter_edge John Cox
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: John Cox @ 2023-07-03 19:04 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: thomas.mundt, John Cox, martin
Adds an outline for aarch neon functions
Adds common macros and consts for aarch64 neon
Exports C filter_intra needed for tail fixup of neon code
Adds neon for filter_intra
Signed-off-by: John Cox <jc@kynesim.co.uk>
---
libavfilter/aarch64/Makefile | 2 +
libavfilter/aarch64/vf_bwdif_init_aarch64.c | 56 ++++++++
libavfilter/aarch64/vf_bwdif_neon.S | 136 ++++++++++++++++++++
libavfilter/bwdif.h | 4 +
libavfilter/vf_bwdif.c | 8 +-
5 files changed, 203 insertions(+), 3 deletions(-)
create mode 100644 libavfilter/aarch64/vf_bwdif_init_aarch64.c
create mode 100644 libavfilter/aarch64/vf_bwdif_neon.S
diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index b58daa3a3f..b68209bc94 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,3 +1,5 @@
+OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_init_aarch64.o
OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o
+NEON-OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_neon.o
NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
new file mode 100644
index 0000000000..3ffaa07ab3
--- /dev/null
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -0,0 +1,56 @@
+/*
+ * bwdif aarch64 NEON optimisations
+ *
+ * Copyright (c) 2023 John Cox <jc@kynesim.co.uk>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavfilter/bwdif.h"
+#include "libavutil/aarch64/cpu.h"
+
+void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max);
+
+
+static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max)
+{
+ const int w0 = clip_max != 255 ? 0 : w & ~15;
+
+ ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
+
+ if (w0 < w)
+ ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0,
+ w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
+}
+
+void
+ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
+{
+ const int cpu_flags = av_get_cpu_flags();
+
+ if (bit_depth != 8)
+ return;
+
+ if (!have_neon(cpu_flags))
+ return;
+
+ s->filter_intra = filter_intra_helper;
+}
+
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
new file mode 100644
index 0000000000..e288efbe6c
--- /dev/null
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -0,0 +1,136 @@
+/*
+ * bwdif aarch64 NEON optimisations
+ *
+ * Copyright (c) 2023 John Cox <jc@kynesim.co.uk>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/aarch64/asm.S"
+
+// Space taken on the stack by an int (32-bit)
+#ifdef __APPLE__
+.set SP_INT, 4
+#else
+.set SP_INT, 8
+#endif
+
+.macro SQSHRUNN b, s0, s1, s2, s3, n
+ sqshrun \s0\().4h, \s0\().4s, #\n - 8
+ sqshrun2 \s0\().8h, \s1\().4s, #\n - 8
+ sqshrun \s1\().4h, \s2\().4s, #\n - 8
+ sqshrun2 \s1\().8h, \s3\().4s, #\n - 8
+ uzp2 \b\().16b, \s0\().16b, \s1\().16b
+.endm
+
+.macro SMULL4K a0, a1, a2, a3, s0, s1, k
+ smull \a0\().4s, \s0\().4h, \k
+ smull2 \a1\().4s, \s0\().8h, \k
+ smull \a2\().4s, \s1\().4h, \k
+ smull2 \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMULL4K a0, a1, a2, a3, s0, s1, k
+ umull \a0\().4s, \s0\().4h, \k
+ umull2 \a1\().4s, \s0\().8h, \k
+ umull \a2\().4s, \s1\().4h, \k
+ umull2 \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMLAL4K a0, a1, a2, a3, s0, s1, k
+ umlal \a0\().4s, \s0\().4h, \k
+ umlal2 \a1\().4s, \s0\().8h, \k
+ umlal \a2\().4s, \s1\().4h, \k
+ umlal2 \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMLSL4K a0, a1, a2, a3, s0, s1, k
+ umlsl \a0\().4s, \s0\().4h, \k
+ umlsl2 \a1\().4s, \s0\().8h, \k
+ umlsl \a2\().4s, \s1\().4h, \k
+ umlsl2 \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro LDR_COEFFS d, t0
+ movrel \t0, coeffs, 0
+ ld1 {\d\().8h}, [\t0]
+.endm
+
+// static const uint16_t coef_lf[2] = { 4309, 213 };
+// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
+// static const uint16_t coef_sp[2] = { 5077, 981 };
+
+const coeffs, align=4 // align 4 means align on 2^4 boundry
+ .hword 4309 * 4, 213 * 4 // lf[0]*4 = v0.h[0]
+ .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5]
+ .hword 5077, 981 // sp[0] = v0.h[6]
+endconst
+
+// ============================================================================
+//
+// void ff_bwdif_filter_intra_neon(
+// void *dst1, // x0
+// void *cur1, // x1
+// int w, // w2
+// int prefs, // w3
+// int mrefs, // w4
+// int prefs3, // w5
+// int mrefs3, // w6
+// int parity, // w7 unused
+// int clip_max) // [sp, #0] unused
+
+function ff_bwdif_filter_intra_neon, export=1
+ cmp w2, #0
+ ble 99f
+
+ LDR_COEFFS v0, x17
+
+// for (x = 0; x < w; x++) {
+10:
+
+// interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13;
+ ldr q31, [x1, w4, sxtw]
+ ldr q30, [x1, w3, sxtw]
+ ldr q29, [x1, w6, sxtw]
+ ldr q28, [x1, w5, sxtw]
+
+ uaddl v20.8h, v31.8b, v30.8b
+ uaddl2 v21.8h, v31.16b, v30.16b
+
+ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[6]
+
+ uaddl v20.8h, v29.8b, v28.8b
+ uaddl2 v21.8h, v29.16b, v28.16b
+
+ UMLSL4K v2, v3, v4, v5, v20, v21, v0.h[7]
+
+// dst[0] = av_clip(interpol, 0, clip_max);
+ SQSHRUNN v2, v2, v3, v4, v5, 13
+ str q2, [x0], #16
+
+// dst++;
+// cur++;
+// }
+
+ subs w2, w2, #16
+ add x1, x1, #16
+ bgt 10b
+
+99:
+ ret
+endfunc
diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
index 5749345f78..ae6f6ce223 100644
--- a/libavfilter/bwdif.h
+++ b/libavfilter/bwdif.h
@@ -39,5 +39,9 @@ typedef struct BWDIFContext {
void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
+void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth);
+
+void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max);
#endif /* AVFILTER_BWDIF_H */
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index e278cf1217..035fc58670 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -122,8 +122,8 @@ typedef struct ThreadData {
next2++; \
}
-static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
- int prefs3, int mrefs3, int parity, int clip_max)
+void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max)
{
uint8_t *dst = dst1;
uint8_t *cur = cur1;
@@ -362,13 +362,15 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
s->filter_line = filter_line_c_16bit;
s->filter_edge = filter_edge_16bit;
} else {
- s->filter_intra = filter_intra;
+ s->filter_intra = ff_bwdif_filter_intra_c;
s->filter_line = filter_line_c;
s->filter_edge = filter_edge;
}
#if ARCH_X86
ff_bwdif_init_x86(s, bit_depth);
+#elif ARCH_AARCH64
+ ff_bwdif_init_aarch64(s, bit_depth);
#endif
}
--
2.39.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* [FFmpeg-devel] [PATCH v3 3/7] tests/checkasm: Add test for vf_bwdif filter_edge
2023-07-03 19:04 [FFmpeg-devel] [PATCH v3 0/7] avfilter/vf_bwdif: Add aarch64 neon functions John Cox
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 1/7] tests/checkasm: Add test for vf_bwdif filter_intra John Cox
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 2/7] avfilter/vf_bwdif: Add neon for filter_intra John Cox
@ 2023-07-03 19:04 ` John Cox
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 4/7] avfilter/vf_bwdif: Add neon for filter_edge John Cox
` (3 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: John Cox @ 2023-07-03 19:04 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: thomas.mundt, John Cox, martin
Signed-off-by: John Cox <jc@kynesim.co.uk>
---
tests/checkasm/vf_bwdif.c | 54 +++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
index 034bbabb4c..5fdba09fdc 100644
--- a/tests/checkasm/vf_bwdif.c
+++ b/tests/checkasm/vf_bwdif.c
@@ -83,6 +83,60 @@ void checkasm_check_vf_bwdif(void)
report("bwdif10");
}
+ {
+ LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]);
+ LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]);
+ const int stride = WIDTH;
+ const int mask = (1<<8)-1;
+ int spat;
+ int parity;
+
+ for (spat = 0; spat != 2; ++spat) {
+ for (parity = 0; parity != 2; ++parity) {
+ if (check_func(ctx_8.filter_edge, "bwdif8.edge.s%d.p%d", spat, parity)) {
+
+ declare_func(void, void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat);
+
+ randomize_buffers(prev0, prev1, mask, 11*WIDTH);
+ randomize_buffers(next0, next1, mask, 11*WIDTH);
+ randomize_buffers( cur0, cur1, mask, 11*WIDTH);
+ memset(dst0, 0xba, WIDTH * 3);
+ memset(dst1, 0xba, WIDTH * 3);
+
+ call_ref(dst0 + stride,
+ prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, WIDTH,
+ stride, -stride, stride * 2, -stride * 2,
+ parity, mask, spat);
+ call_new(dst1 + stride,
+ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH,
+ stride, -stride, stride * 2, -stride * 2,
+ parity, mask, spat);
+
+ if (memcmp(dst0, dst1, WIDTH*3)
+ || memcmp(prev0, prev1, WIDTH*11)
+ || memcmp(next0, next1, WIDTH*11)
+ || memcmp( cur0, cur1, WIDTH*11))
+ fail();
+
+ bench_new(dst1 + stride,
+ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH,
+ stride, -stride, stride * 2, -stride * 2,
+ parity, mask, spat);
+ }
+ }
+ }
+
+ report("bwdif8.edge");
+ }
+
if (check_func(ctx_8.filter_intra, "bwdif8.intra")) {
LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]);
LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]);
--
2.39.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* [FFmpeg-devel] [PATCH v3 4/7] avfilter/vf_bwdif: Add neon for filter_edge
2023-07-03 19:04 [FFmpeg-devel] [PATCH v3 0/7] avfilter/vf_bwdif: Add aarch64 neon functions John Cox
` (2 preceding siblings ...)
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 3/7] tests/checkasm: Add test for vf_bwdif filter_edge John Cox
@ 2023-07-03 19:04 ` John Cox
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 5/7] avfilter/vf_bwdif: Add neon for filter_line Exports C filter_line needed for tail fixup of neon code John Cox
` (2 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: John Cox @ 2023-07-03 19:04 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: thomas.mundt, John Cox, martin
Adds clip and spatial macros for aarch64 neon
Exports C filter_edge needed for tail fixup of neon code
Adds neon for filter_edge
Signed-off-by: John Cox <jc@kynesim.co.uk>
---
libavfilter/aarch64/vf_bwdif_init_aarch64.c | 20 +++
libavfilter/aarch64/vf_bwdif_neon.S | 177 ++++++++++++++++++++
libavfilter/bwdif.h | 4 +
libavfilter/vf_bwdif.c | 8 +-
4 files changed, 205 insertions(+), 4 deletions(-)
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index 3ffaa07ab3..e75cf2f204 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -24,10 +24,29 @@
#include "libavfilter/bwdif.h"
#include "libavutil/aarch64/cpu.h"
+void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat);
+
void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max);
+static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat)
+{
+ const int w0 = clip_max != 255 ? 0 : w & ~15;
+
+ ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2,
+ parity, clip_max, spat);
+
+ if (w0 < w)
+ ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
+ w - w0, prefs, mrefs, prefs2, mrefs2,
+ parity, clip_max, spat);
+}
+
static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max)
{
@@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
return;
s->filter_intra = filter_intra_helper;
+ s->filter_edge = filter_edge_helper;
}
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index e288efbe6c..389302b813 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -66,6 +66,79 @@
umlsl2 \a3\().4s, \s1\().8h, \k
.endm
+// int b = m2s1 - m1;
+// int f = p2s1 - p1;
+// int dc = c0s1 - m1;
+// int de = c0s1 - p1;
+// int sp_max = FFMIN(p1 - c0s1, m1 - c0s1);
+// sp_max = FFMIN(sp_max, FFMAX(-b,-f));
+// int sp_min = FFMIN(c0s1 - p1, c0s1 - m1);
+// sp_min = FFMIN(sp_min, FFMAX(b,f));
+// diff = diff == 0 ? 0 : FFMAX3(diff, sp_min, sp_max);
+.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3
+ uqsub \t0\().16b, \p1\().16b, \c0s1\().16b
+ uqsub \t2\().16b, \m1\().16b, \c0s1\().16b
+ umin \t2\().16b, \t0\().16b, \t2\().16b
+
+ uqsub \t1\().16b, \m1\().16b, \m2s1\().16b
+ uqsub \t3\().16b, \p1\().16b, \p2s1\().16b
+ umax \t3\().16b, \t3\().16b, \t1\().16b
+ umin \t3\().16b, \t3\().16b, \t2\().16b
+
+ uqsub \t0\().16b, \c0s1\().16b, \p1\().16b
+ uqsub \t2\().16b, \c0s1\().16b, \m1\().16b
+ umin \t2\().16b, \t0\().16b, \t2\().16b
+
+ uqsub \t1\().16b, \m2s1\().16b, \m1\().16b
+ uqsub \t0\().16b, \p2s1\().16b, \p1\().16b
+ umax \t0\().16b, \t0\().16b, \t1\().16b
+ umin \t2\().16b, \t2\().16b, \t0\().16b
+
+ cmeq \t1\().16b, \diff\().16b, #0
+ umax \diff\().16b, \diff\().16b, \t3\().16b
+ umax \diff\().16b, \diff\().16b, \t2\().16b
+ bic \diff\().16b, \diff\().16b, \t1\().16b
+.endm
+
+// i0 = s0;
+// if (i0 > d0 + diff0)
+// i0 = d0 + diff0;
+// else if (i0 < d0 - diff0)
+// i0 = d0 - diff0;
+//
+// i0 = s0 is safe
+.macro DIFF_CLIP i0, s0, d0, diff, t0, t1
+ uqadd \t0\().16b, \d0\().16b, \diff\().16b
+ uqsub \t1\().16b, \d0\().16b, \diff\().16b
+ umin \i0\().16b, \s0\().16b, \t0\().16b
+ umax \i0\().16b, \i0\().16b, \t1\().16b
+.endm
+
+// i0 = FFABS(m1 - p1) > td0 ? i1 : i2;
+// DIFF_CLIP
+//
+// i0 = i1 is safe
+.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2
+ uabd \t0\().16b, \m1\().16b, \p1\().16b
+ cmhi \t0\().16b, \t0\().16b, \td0\().16b
+ bsl \t0\().16b, \i1\().16b, \i2\().16b
+ DIFF_CLIP \i0, \t0, \d0, \diff, \t1, \t2
+.endm
+
+.macro PUSH_VREGS
+ stp d8, d9, [sp, #-64]!
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+.endm
+
+.macro POP_VREGS
+ ldp d14, d15, [sp, #48]
+ ldp d12, d13, [sp, #32]
+ ldp d10, d11, [sp, #16]
+ ldp d8, d9, [sp], #64
+.endm
+
.macro LDR_COEFFS d, t0
movrel \t0, coeffs, 0
ld1 {\d\().8h}, [\t0]
@@ -81,6 +154,110 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry
.hword 5077, 981 // sp[0] = v0.h[6]
endconst
+// ============================================================================
+//
+// void ff_bwdif_filter_edge_neon(
+// void *dst1, // x0
+// void *prev1, // x1
+// void *cur1, // x2
+// void *next1, // x3
+// int w, // w4
+// int prefs, // w5
+// int mrefs, // w6
+// int prefs2, // w7
+// int mrefs2, // [sp, #0]
+// int parity, // [sp, #SP_INT]
+// int clip_max, // [sp, #SP_INT*2] unused
+// int spat); // [sp, #SP_INT*3]
+
+function ff_bwdif_filter_edge_neon, export=1
+ // Sanity check w
+ cmp w4, #0
+ ble 99f
+
+// #define prev2 cur
+// const uint8_t * restrict next2 = parity ? prev : next;
+
+ ldr w8, [sp, #0] // mrefs2
+
+ ldr w17, [sp, #SP_INT] // parity
+ ldr w16, [sp, #SP_INT*3] // spat
+ cmp w17, #0
+ csel x17, x1, x3, ne
+
+// for (x = 0; x < w; x++) {
+
+10:
+// int m1 = cur[mrefs];
+// int d = (prev2[0] + next2[0]) >> 1;
+// int p1 = cur[prefs];
+// int temporal_diff0 = FFABS(prev2[0] - next2[0]);
+// int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
+// int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
+// int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2);
+ ldr q31, [x2]
+ ldr q21, [x17]
+ uhadd v16.16b, v31.16b, v21.16b // d0 = v16
+ uabd v17.16b, v31.16b, v21.16b // td0 = v17
+ ldr q24, [x2, w6, sxtw] // m1 = v24
+ ldr q22, [x2, w5, sxtw] // p1 = v22
+
+ ldr q0, [x1, w6, sxtw] // prev[mrefs]
+ ldr q2, [x1, w5, sxtw] // prev[prefs]
+ ldr q1, [x3, w6, sxtw] // next[mrefs]
+ ldr q3, [x3, w5, sxtw] // next[prefs]
+
+ ushr v29.16b, v17.16b, #1
+
+ uabd v31.16b, v0.16b, v24.16b
+ uabd v30.16b, v2.16b, v22.16b
+ uhadd v0.16b, v31.16b, v30.16b // td1 = q0
+
+ uabd v31.16b, v1.16b, v24.16b
+ uabd v30.16b, v3.16b, v22.16b
+ uhadd v1.16b, v31.16b, v30.16b // td2 = q1
+
+ umax v0.16b, v0.16b, v29.16b
+ umax v0.16b, v0.16b, v1.16b // diff = v0
+
+// if (spat) {
+// SPAT_CHECK()
+// }
+// i0 = (m1 + p1) >> 1;
+ cbz w16, 1f
+
+ ldr q31, [x2, w8, sxtw]
+ ldr q18, [x17, w8, sxtw]
+ ldr q30, [x2, w7, sxtw]
+ ldr q19, [x17, w7, sxtw]
+ uhadd v18.16b, v18.16b, v31.16b
+ uhadd v19.16b, v19.16b, v30.16b
+
+ SPAT_CHECK v0, v18, v24, v16, v22, v19, v31, v30, v29, v28
+
+1:
+ uhadd v2.16b, v22.16b, v24.16b
+
+ // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30
+ DIFF_CLIP v2, v2, v16, v0, v31, v30
+
+// dst[0] = av_clip(interpol, 0, clip_max);
+ str q2, [x0], #16
+
+// dst++;
+// cur++;
+// }
+ subs w4, w4, #16
+ add x1, x1, #16
+ add x2, x2, #16
+ add x3, x3, #16
+ add x17, x17, #16
+ bgt 10b
+
+99:
+ ret
+endfunc
+
// ============================================================================
//
// void ff_bwdif_filter_intra_neon(
diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
index ae6f6ce223..ae1616d366 100644
--- a/libavfilter/bwdif.h
+++ b/libavfilter/bwdif.h
@@ -41,6 +41,10 @@ void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth);
+void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat);
+
void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max);
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 035fc58670..bec83111b4 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -150,9 +150,9 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
FILTER2()
}
-static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
- int w, int prefs, int mrefs, int prefs2, int mrefs2,
- int parity, int clip_max, int spat)
+void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat)
{
uint8_t *dst = dst1;
uint8_t *prev = prev1;
@@ -364,7 +364,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
} else {
s->filter_intra = ff_bwdif_filter_intra_c;
s->filter_line = filter_line_c;
- s->filter_edge = filter_edge;
+ s->filter_edge = ff_bwdif_filter_edge_c;
}
#if ARCH_X86
--
2.39.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* [FFmpeg-devel] [PATCH v3 5/7] avfilter/vf_bwdif: Add neon for filter_line Exports C filter_line needed for tail fixup of neon code
2023-07-03 19:04 [FFmpeg-devel] [PATCH v3 0/7] avfilter/vf_bwdif: Add aarch64 neon functions John Cox
` (3 preceding siblings ...)
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 4/7] avfilter/vf_bwdif: Add neon for filter_edge John Cox
@ 2023-07-03 19:04 ` John Cox
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 6/7] avfilter/vf_bwdif: Add a filter_line3 method for optimisation John Cox
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 7/7] avfilter/vf_bwdif: Add neon for filter_line3 John Cox
6 siblings, 0 replies; 8+ messages in thread
From: John Cox @ 2023-07-03 19:04 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: thomas.mundt, John Cox, martin
Signed-off-by: John Cox <jc@kynesim.co.uk>
---
libavfilter/aarch64/vf_bwdif_init_aarch64.c | 21 ++
libavfilter/aarch64/vf_bwdif_neon.S | 208 ++++++++++++++++++++
libavfilter/bwdif.h | 5 +
libavfilter/vf_bwdif.c | 10 +-
4 files changed, 239 insertions(+), 5 deletions(-)
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index e75cf2f204..21e67884ab 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max);
+void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max);
+
+
+static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max)
+{
+ const int w0 = clip_max != 255 ? 0 : w & ~15;
+
+ ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
+ w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
+
+ if (w0 < w)
+ ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
+ w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
+}
static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
@@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
return;
s->filter_intra = filter_intra_helper;
+ s->filter_line = filter_line_helper;
s->filter_edge = filter_edge_helper;
}
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index 389302b813..ae5f09c511 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -154,6 +154,214 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry
.hword 5077, 981 // sp[0] = v0.h[6]
endconst
+// ===========================================================================
+//
+// void filter_line(
+// void *dst1, // x0
+// void *prev1, // x1
+// void *cur1, // x2
+// void *next1, // x3
+// int w, // w4
+// int prefs, // w5
+// int mrefs, // w6
+// int prefs2, // w7
+// int mrefs2, // [sp, #0]
+// int prefs3, // [sp, #SP_INT]
+// int mrefs3, // [sp, #SP_INT*2]
+// int prefs4, // [sp, #SP_INT*3]
+// int mrefs4, // [sp, #SP_INT*4]
+// int parity, // [sp, #SP_INT*5]
+// int clip_max) // [sp, #SP_INT*6]
+
+function ff_bwdif_filter_line_neon, export=1
+ // Sanity check w
+ cmp w4, #0
+ ble 99f
+
+ // Rearrange regs to be the same as line3 for ease of debug!
+ mov w10, w4 // w10 = loop count
+ mov w9, w6 // w9 = mref
+ mov w12, w7 // w12 = pref2
+ mov w11, w5 // w11 = pref
+ ldr w8, [sp, #0] // w8 = mref2
+ ldr w7, [sp, #SP_INT*2] // w7 = mref3
+ ldr w6, [sp, #SP_INT*4] // w6 = mref4
+ ldr w13, [sp, #SP_INT] // w13 = pref3
+ ldr w14, [sp, #SP_INT*3] // w14 = pref4
+
+ mov x4, x3
+ mov x3, x2
+ mov x2, x1
+
+ LDR_COEFFS v0, x17
+
+// #define prev2 cur
+// const uint8_t * restrict next2 = parity ? prev : next;
+ ldr w17, [sp, #SP_INT*5] // parity
+ cmp w17, #0
+ csel x17, x2, x4, ne
+
+ PUSH_VREGS
+
+// for (x = 0; x < w; x++) {
+// int diff0, diff2;
+// int d0, d2;
+// int temporal_diff0, temporal_diff2;
+//
+// int i1, i2;
+// int j1, j2;
+// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
+
+10:
+// c0 = prev2[0] + next2[0]; // c0 = v20, v21
+// d0 = c0 >> 1; // d0 = v10
+// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
+ ldr q31, [x3]
+ ldr q21, [x17]
+ uhadd v10.16b, v31.16b, v21.16b
+ uabd v11.16b, v31.16b, v21.16b
+ uaddl v20.8h, v21.8b, v31.8b
+ uaddl2 v21.8h, v21.16b, v31.16b
+
+ ldr q31, [x3, w6, sxtw]
+ ldr q23, [x17, w6, sxtw]
+
+// i1 = coef_hf[0] * c0; // i1 = v2-v5
+ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2]
+
+ ldr q30, [x3, w14, sxtw]
+ ldr q25, [x17, w14, sxtw]
+
+// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23
+ uaddl v22.8h, v23.8b, v31.8b
+ uaddl2 v23.8h, v23.16b, v31.16b
+
+// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12
+ uhadd v12.16b, v25.16b, v30.16b
+ uaddl v24.8h, v25.8b, v30.8b
+ uaddl2 v25.8h, v25.16b, v30.16b
+
+// m3 = cur[mrefs3]; // m3 = v20
+ ldr q20, [x3, w7, sxtw]
+
+// p3 = cur[prefs3]; // p3 = v21
+ ldr q21, [x3, w13, sxtw]
+
+// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25)
+ add v22.8h, v22.8h, v24.8h
+ add v23.8h, v23.8h, v25.8h
+ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4]
+
+ ldr q29, [x3, w8, sxtw]
+ ldr q23, [x17, w8, sxtw]
+
+// i1 -= coef_lf[1] * 4 * (m3 + p3); // -
+ uaddl v30.8h, v20.8b, v21.8b
+ uaddl2 v31.8h, v20.16b, v21.16b
+
+ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1]
+
+// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13
+ uhadd v13.16b, v23.16b, v29.16b
+ uaddl v22.8h, v23.8b, v29.8b
+ uaddl2 v23.8h, v23.16b, v29.16b
+
+ ldr q31, [x3, w12, sxtw]
+ ldr q27, [x17, w12, sxtw]
+
+// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25)
+ add v24.8h, v24.8h, v22.8h
+ add v25.8h, v25.8h, v23.8h
+ UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4]
+
+// m1 = cur[mrefs]; // m1 = v24
+ ldr q24, [x3, w9, sxtw]
+
+// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27
+// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
+// d2 = p2 >> 1; // d2 = v15
+ uabd v14.16b, v31.16b, v27.16b
+ uhadd v15.16b, v31.16b, v27.16b
+ uaddl v26.8h, v27.8b, v31.8b
+ uaddl2 v27.8h, v27.16b, v31.16b
+
+// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*)
+ add v22.8h, v22.8h, v26.8h
+ add v23.8h, v23.8h, v27.8h
+ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3]
+
+// p1 = cur[prefs]; // p1 = v22
+ ldr q22, [x3, w11, sxtw]
+
+// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
+ uaddl v18.8h, v22.8b, v24.8b
+ uaddl2 v19.8h, v22.16b, v24.16b
+ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6]
+
+ uaddl v18.8h, v20.8b, v21.8b
+ uaddl2 v19.8h, v20.16b, v21.16b
+ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7]
+
+ SQSHRUNN v17, v28, v29, v30, v31, 13
+
+// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24
+ uaddl v26.8h, v24.8b, v22.8b
+ uaddl2 v27.8h, v24.16b, v22.16b
+ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0]
+
+ ldr q31, [x2, w9, sxtw]
+ ldr q29, [x4, w9, sxtw]
+
+ ldr q30, [x2, w11, sxtw]
+ ldr q28, [x4, w11, sxtw]
+
+// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5*
+ SQSHRUNN v2, v2, v3, v4, v5, 15
+
+// {
+// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
+// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
+ uabd v30.16b, v22.16b, v30.16b
+ uabd v31.16b, v24.16b, v31.16b
+ uabd v28.16b, v22.16b, v28.16b
+ uabd v29.16b, v24.16b, v29.16b
+ uhadd v31.16b, v31.16b, v30.16b
+ uhadd v29.16b, v29.16b, v28.16b
+
+// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
+ ushr v18.16b, v11.16b, #1
+ umax v18.16b, v18.16b, v31.16b
+ umax v18.16b, v18.16b, v29.16b
+
+ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
+ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28
+
+ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18
+ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29
+
+// dst[0] = av_clip_uint8(interpol);
+ str q2, [x0], #16
+// }
+//
+// dst++;
+// cur++;
+// prev++;
+// prev2++;
+// next++;
+// }
+
+ subs w10, w10, #16
+ add x2, x2, #16
+ add x3, x3, #16
+ add x4, x4, #16
+ add x17, x17, #16
+ bgt 10b
+
+ POP_VREGS
+99:
+ ret
+endfunc
+
// ============================================================================
//
// void ff_bwdif_filter_edge_neon(
diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
index ae1616d366..cce99953f3 100644
--- a/libavfilter/bwdif.h
+++ b/libavfilter/bwdif.h
@@ -48,4 +48,9 @@ void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max);
+void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max);
+
#endif /* AVFILTER_BWDIF_H */
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index bec83111b4..26349da1fd 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -132,10 +132,10 @@ void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs
FILTER_INTRA()
}
-static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
- int w, int prefs, int mrefs, int prefs2, int mrefs2,
- int prefs3, int mrefs3, int prefs4, int mrefs4,
- int parity, int clip_max)
+void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max)
{
uint8_t *dst = dst1;
uint8_t *prev = prev1;
@@ -363,7 +363,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
s->filter_edge = filter_edge_16bit;
} else {
s->filter_intra = ff_bwdif_filter_intra_c;
- s->filter_line = filter_line_c;
+ s->filter_line = ff_bwdif_filter_line_c;
s->filter_edge = ff_bwdif_filter_edge_c;
}
--
2.39.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* [FFmpeg-devel] [PATCH v3 6/7] avfilter/vf_bwdif: Add a filter_line3 method for optimisation
2023-07-03 19:04 [FFmpeg-devel] [PATCH v3 0/7] avfilter/vf_bwdif: Add aarch64 neon functions John Cox
` (4 preceding siblings ...)
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 5/7] avfilter/vf_bwdif: Add neon for filter_line Exports C filter_line needed for tail fixup of neon code John Cox
@ 2023-07-03 19:04 ` John Cox
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 7/7] avfilter/vf_bwdif: Add neon for filter_line3 John Cox
6 siblings, 0 replies; 8+ messages in thread
From: John Cox @ 2023-07-03 19:04 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: thomas.mundt, John Cox, martin
Add an optional filter_line3 to the available optimisations.
filter_line3 is equivalent to filter_line, memcpy, filter_line
filter_line shares quite a number of loads and some calculations in
common with its next iteration and testing shows that using aarch64
neon filter_line3s performance is 30% better than two filter_lines
and a memcpy.
Adds a test for vf_bwdif filter_line3 to checkasm
Rounds job start lines down to a multiple of 4. This means that if
filter_line3 exists then filter_line will not sometimes be called
once at the end of a slice depending on thread count. The final slice
may do up to 3 extra lines but filter_edge is faster than filter_line
so it is unlikely to create any noticable thread load variation.
Signed-off-by: John Cox <jc@kynesim.co.uk>
---
libavfilter/bwdif.h | 7 ++++
libavfilter/vf_bwdif.c | 44 +++++++++++++++++++--
tests/checkasm/vf_bwdif.c | 81 +++++++++++++++++++++++++++++++++++++++
3 files changed, 129 insertions(+), 3 deletions(-)
diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
index cce99953f3..496cec72ef 100644
--- a/libavfilter/bwdif.h
+++ b/libavfilter/bwdif.h
@@ -35,6 +35,9 @@ typedef struct BWDIFContext {
void (*filter_edge)(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int parity, int clip_max, int spat);
+ void (*filter_line3)(void *dst, int dstride,
+ const void *prev, const void *cur, const void *next, int prefs,
+ int w, int parity, int clip_max);
} BWDIFContext;
void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
@@ -53,4 +56,8 @@ void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max);
+void ff_bwdif_filter_line3_c(void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int s_stride,
+ int w, int parity, int clip_max);
+
#endif /* AVFILTER_BWDIF_H */
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 26349da1fd..6701208efe 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -150,6 +150,31 @@ void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
FILTER2()
}
+#define NEXT_LINE()\
+ dst += d_stride; \
+ prev += prefs; \
+ cur += prefs; \
+ next += prefs;
+
+void ff_bwdif_filter_line3_c(void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int s_stride,
+ int w, int parity, int clip_max)
+{
+ const int prefs = s_stride;
+ uint8_t * dst = dst1;
+ const uint8_t * prev = prev1;
+ const uint8_t * cur = cur1;
+ const uint8_t * next = next1;
+
+ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,
+ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max);
+ NEXT_LINE();
+ memcpy(dst, cur, w);
+ NEXT_LINE();
+ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,
+ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max);
+}
+
void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int parity, int clip_max, int spat)
@@ -212,6 +237,13 @@ static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1,
FILTER2()
}
+// Round job start line down to multiple of 4 so that if filter_line3 exists
+// and the frame is a multiple of 4 high then filter_line will never be called
+static inline int job_start(const int jobnr, const int nb_jobs, const int h)
+{
+ return jobnr >= nb_jobs ? h : ((h * jobnr) / nb_jobs) & ~3;
+}
+
static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
BWDIFContext *s = ctx->priv;
@@ -221,8 +253,8 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
int clip_max = (1 << (yadif->csp->comp[td->plane].depth)) - 1;
int df = (yadif->csp->comp[td->plane].depth + 7) / 8;
int refs = linesize / df;
- int slice_start = (td->h * jobnr ) / nb_jobs;
- int slice_end = (td->h * (jobnr+1)) / nb_jobs;
+ int slice_start = job_start(jobnr, nb_jobs, td->h);
+ int slice_end = job_start(jobnr + 1, nb_jobs, td->h);
int y;
for (y = slice_start; y < slice_end; y++) {
@@ -244,6 +276,11 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
refs << 1, -(refs << 1),
td->parity ^ td->tff, clip_max,
(y < 2) || ((y + 3) > td->h) ? 0 : 1);
+ } else if (s->filter_line3 && y + 2 < slice_end && y + 6 < td->h) {
+ s->filter_line3(dst, td->frame->linesize[td->plane],
+ prev, cur, next, linesize, td->w,
+ td->parity ^ td->tff, clip_max);
+ y += 2;
} else {
s->filter_line(dst, prev, cur, next, td->w,
refs, -refs, refs << 1, -(refs << 1),
@@ -280,7 +317,7 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
td.plane = i;
ff_filter_execute(ctx, filter_slice, &td, NULL,
- FFMIN(h, ff_filter_get_nb_threads(ctx)));
+ FFMIN((h+3)/4, ff_filter_get_nb_threads(ctx)));
}
if (yadif->current_field == YADIF_FIELD_END) {
yadif->current_field = YADIF_FIELD_NORMAL;
@@ -357,6 +394,7 @@ static int config_props(AVFilterLink *link)
av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
{
+ s->filter_line3 = 0;
if (bit_depth > 8) {
s->filter_intra = filter_intra_16bit;
s->filter_line = filter_line_c_16bit;
diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
index 5fdba09fdc..3399cacdf7 100644
--- a/tests/checkasm/vf_bwdif.c
+++ b/tests/checkasm/vf_bwdif.c
@@ -28,6 +28,10 @@
for (size_t i = 0; i < count; i++) \
buf0[i] = buf1[i] = rnd() & mask
+#define randomize_overflow_check(buf0, buf1, mask, count) \
+ for (size_t i = 0; i < count; i++) \
+ buf0[i] = buf1[i] = (rnd() & 1) != 0 ? mask : 0;
+
#define BODY(type, depth) \
do { \
type prev0[9*WIDTH], prev1[9*WIDTH]; \
@@ -83,6 +87,83 @@ void checkasm_check_vf_bwdif(void)
report("bwdif10");
}
+ if (!ctx_8.filter_line3)
+ ctx_8.filter_line3 = ff_bwdif_filter_line3_c;
+
+ {
+ LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]);
+ LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]);
+ const int stride = WIDTH;
+ const int mask = (1<<8)-1;
+ int parity;
+
+ for (parity = 0; parity != 2; ++parity) {
+ if (check_func(ctx_8.filter_line3, "bwdif8.line3.rnd.p%d", parity)) {
+
+ declare_func(void, void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int prefs,
+ int w, int parity, int clip_max);
+
+ randomize_buffers(prev0, prev1, mask, 11*WIDTH);
+ randomize_buffers(next0, next1, mask, 11*WIDTH);
+ randomize_buffers( cur0, cur1, mask, 11*WIDTH);
+
+ call_ref(dst0, stride,
+ prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride,
+ WIDTH, parity, mask);
+ call_new(dst1, stride,
+ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
+ WIDTH, parity, mask);
+
+ if (memcmp(dst0, dst1, WIDTH*3)
+ || memcmp(prev0, prev1, WIDTH*11)
+ || memcmp(next0, next1, WIDTH*11)
+ || memcmp( cur0, cur1, WIDTH*11))
+ fail();
+
+ bench_new(dst1, stride,
+ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
+ WIDTH, parity, mask);
+ }
+ }
+
+ // Use just 0s and ~0s to try to provoke bad cropping or overflow
+ // Parity makes no difference to this test so just test 0
+ if (check_func(ctx_8.filter_line3, "bwdif8.line3.overflow")) {
+
+ declare_func(void, void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int prefs,
+ int w, int parity, int clip_max);
+
+ randomize_overflow_check(prev0, prev1, mask, 11*WIDTH);
+ randomize_overflow_check(next0, next1, mask, 11*WIDTH);
+ randomize_overflow_check( cur0, cur1, mask, 11*WIDTH);
+
+ call_ref(dst0, stride,
+ prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride,
+ WIDTH, 0, mask);
+ call_new(dst1, stride,
+ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
+ WIDTH, 0, mask);
+
+ if (memcmp(dst0, dst1, WIDTH*3)
+ || memcmp(prev0, prev1, WIDTH*11)
+ || memcmp(next0, next1, WIDTH*11)
+ || memcmp( cur0, cur1, WIDTH*11))
+ fail();
+
+ // No point to benching
+ }
+
+ report("bwdif8.line3");
+ }
+
{
LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
--
2.39.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* [FFmpeg-devel] [PATCH v3 7/7] avfilter/vf_bwdif: Add neon for filter_line3
2023-07-03 19:04 [FFmpeg-devel] [PATCH v3 0/7] avfilter/vf_bwdif: Add aarch64 neon functions John Cox
` (5 preceding siblings ...)
2023-07-03 19:04 ` [FFmpeg-devel] [PATCH v3 6/7] avfilter/vf_bwdif: Add a filter_line3 method for optimisation John Cox
@ 2023-07-03 19:04 ` John Cox
6 siblings, 0 replies; 8+ messages in thread
From: John Cox @ 2023-07-03 19:04 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: thomas.mundt, John Cox, martin
Signed-off-by: John Cox <jc@kynesim.co.uk>
---
libavfilter/aarch64/vf_bwdif_init_aarch64.c | 28 ++
libavfilter/aarch64/vf_bwdif_neon.S | 272 ++++++++++++++++++++
2 files changed, 300 insertions(+)
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index 21e67884ab..f52bc4b9b4 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -36,6 +36,33 @@ void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max);
+void ff_bwdif_filter_line3_neon(void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int s_stride,
+ int w, int parity, int clip_max);
+
+
+static void filter_line3_helper(void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int s_stride,
+ int w, int parity, int clip_max)
+{
+ // Asm works on 16 byte chunks
+ // If w is a multiple of 16 then all is good - if not then if width rounded
+ // up to nearest 16 will fit in both src & dst strides then allow the asm
+ // to write over the padding bytes as that is almost certainly faster than
+ // having to invoke the C version to clean up the tail.
+ const int w1 = FFALIGN(w, 16);
+ const int w0 = clip_max != 255 ? 0 :
+ d_stride <= w1 && s_stride <= w1 ? w : w & ~15;
+
+ ff_bwdif_filter_line3_neon(dst1, d_stride,
+ prev1, cur1, next1, s_stride,
+ w0, parity, clip_max);
+
+ if (w0 < w)
+ ff_bwdif_filter_line3_c((char *)dst1 + w0, d_stride,
+ (const char *)prev1 + w0, (const char *)cur1 + w0, (const char *)next1 + w0, s_stride,
+ w - w0, parity, clip_max);
+}
static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
@@ -93,5 +120,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
s->filter_intra = filter_intra_helper;
s->filter_line = filter_line_helper;
s->filter_edge = filter_edge_helper;
+ s->filter_line3 = filter_line3_helper;
}
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index ae5f09c511..bc092477b9 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -154,6 +154,278 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry
.hword 5077, 981 // sp[0] = v0.h[6]
endconst
+// ===========================================================================
+//
+// void ff_bwdif_filter_line3_neon(
+// void * dst1, // x0
+// int d_stride, // w1
+// const void * prev1, // x2
+// const void * cur1, // x3
+// const void * next1, // x4
+// int s_stride, // w5
+// int w, // w6
+// int parity, // w7
+// int clip_max); // [sp, #0] (Ignored)
+
+function ff_bwdif_filter_line3_neon, export=1
+ // Sanity check w
+ cmp w6, #0
+ ble 99f
+
+ LDR_COEFFS v0, x17
+
+// #define prev2 cur
+// const uint8_t * restrict next2 = parity ? prev : next;
+ cmp w7, #0
+ csel x17, x2, x4, ne
+
+ // We want all the V registers - save all the ones we must
+ PUSH_VREGS
+
+ // Some rearrangement of initial values for nice layout of refs in regs
+ mov w10, w6 // w10 = loop count
+ neg w9, w5 // w9 = mref
+ lsl w8, w9, #1 // w8 = mref2
+ add w7, w9, w9, LSL #1 // w7 = mref3
+ lsl w6, w9, #2 // w6 = mref4
+ mov w11, w5 // w11 = pref
+ lsl w12, w5, #1 // w12 = pref2
+ add w13, w5, w5, LSL #1 // w13 = pref3
+ lsl w14, w5, #2 // w14 = pref4
+ add w15, w5, w5, LSL #2 // w15 = pref5
+ add w16, w14, w12 // w16 = pref6
+
+ lsl w5, w1, #1 // w5 = d_stride * 2
+
+// for (x = 0; x < w; x++) {
+// int diff0, diff2;
+// int d0, d2;
+// int temporal_diff0, temporal_diff2;
+//
+// int i1, i2;
+// int j1, j2;
+// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
+
+10:
+// c0 = prev2[0] + next2[0]; // c0 = v20, v21
+// d0 = c0 >> 1; // d0 = v10
+// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
+ ldr q31, [x3]
+ ldr q21, [x17]
+ uhadd v10.16b, v31.16b, v21.16b
+ uabd v11.16b, v31.16b, v21.16b
+ uaddl v20.8h, v21.8b, v31.8b
+ uaddl2 v21.8h, v21.16b, v31.16b
+
+ ldr q31, [x3, w6, sxtw]
+ ldr q23, [x17, w6, sxtw]
+
+// i1 = coef_hf[0] * c0; // i1 = v2-v5
+ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2]
+
+ ldr q30, [x3, w14, sxtw]
+ ldr q25, [x17, w14, sxtw]
+
+// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23
+ uaddl v22.8h, v23.8b, v31.8b
+ uaddl2 v23.8h, v23.16b, v31.16b
+
+// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12
+ uhadd v12.16b, v25.16b, v30.16b
+ uaddl v24.8h, v25.8b, v30.8b
+ uaddl2 v25.8h, v25.16b, v30.16b
+
+// j1 = -coef_hf[1] * (c0 + p4); // j1 = v6-v9 (-c0:v20,v21)
+ add v20.8h, v20.8h, v24.8h
+ add v21.8h, v21.8h, v25.8h
+ SMULL4K v6, v7, v8, v9, v20, v21, v0.h[5]
+
+// m3 = cur[mrefs3]; // m3 = v20
+ ldr q20, [x3, w7, sxtw]
+
+// p3 = cur[prefs3]; // p3 = v21
+ ldr q21, [x3, w13, sxtw]
+
+// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25)
+ add v22.8h, v22.8h, v24.8h
+ add v23.8h, v23.8h, v25.8h
+ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4]
+
+ ldr q29, [x3, w8, sxtw]
+ ldr q23, [x17, w8, sxtw]
+
+// i1 -= coef_lf[1] * 4 * (m3 + p3); // -
+ uaddl v30.8h, v20.8b, v21.8b
+ uaddl2 v31.8h, v20.16b, v21.16b
+
+ ldr q28, [x3, w16, sxtw]
+ ldr q25, [x17, w16, sxtw]
+
+ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1]
+
+// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13
+ uhadd v13.16b, v23.16b, v29.16b
+ uaddl v22.8h, v23.8b, v29.8b
+ uaddl2 v23.8h, v23.16b, v29.16b
+
+ ldr q31, [x3, w12, sxtw]
+ ldr q27, [x17, w12, sxtw]
+
+// p6 = prev2[prefs6] + next2[prefs6]; // p6 = v24,v25
+ uaddl v24.8h, v25.8b, v28.8b
+ uaddl2 v25.8h, v25.16b, v28.16b
+
+// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25)
+ add v24.8h, v24.8h, v22.8h
+ add v25.8h, v25.8h, v23.8h
+ UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4]
+
+// m1 = cur[mrefs]; // m1 = v24
+ ldr q24, [x3, w9, sxtw]
+
+// p5 = cur[prefs5]; // p5 = v25
+ ldr q25, [x3, w15, sxtw]
+
+// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27
+// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
+// d2 = p2 >> 1; // d2 = v15
+ uabd v14.16b, v31.16b, v27.16b
+ uhadd v15.16b, v31.16b, v27.16b
+ uaddl v26.8h, v27.8b, v31.8b
+ uaddl2 v27.8h, v27.16b, v31.16b
+
+// j1 += coef_hf[0] * p2; // -
+ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[2]
+
+// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*)
+ add v22.8h, v22.8h, v26.8h
+ add v23.8h, v23.8h, v27.8h
+ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3]
+
+// p1 = cur[prefs]; // p1 = v22
+ ldr q22, [x3, w11, sxtw]
+
+// j1 -= coef_lf[1] * 4 * (m1 + p5); // -
+ uaddl v26.8h, v24.8b, v25.8b
+ uaddl2 v27.8h, v24.16b, v25.16b
+ UMLSL4K v6, v7, v8, v9, v26, v27, v0.h[1]
+
+// j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1] * (m1 + p5)) >> 13; // (-p5:v25*) j2=v16
+ uaddl v18.8h, v22.8b, v21.8b
+ uaddl2 v19.8h, v22.16b, v21.16b
+ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6]
+
+ uaddl v18.8h, v24.8b, v25.8b
+ uaddl2 v19.8h, v24.16b, v25.16b
+ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7]
+
+ SQSHRUNN v16, v28, v29, v30, v31, 13
+
+// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
+ uaddl v18.8h, v22.8b, v24.8b
+ uaddl2 v19.8h, v22.16b, v24.16b
+ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6]
+
+ uaddl v18.8h, v20.8b, v21.8b
+ uaddl2 v19.8h, v20.16b, v21.16b
+ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7]
+
+ SQSHRUNN v17, v28, v29, v30, v31, 13
+
+// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24
+ uaddl v26.8h, v24.8b, v22.8b
+ uaddl2 v27.8h, v24.16b, v22.16b
+ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0]
+
+ ldr q31, [x2, w9, sxtw]
+ ldr q29, [x4, w9, sxtw]
+
+// j1 += coef_lf[0] * 4 * (p1 + p3); // p1 = v22, p3 = v21
+ uaddl v26.8h, v21.8b, v22.8b
+ uaddl2 v27.8h, v21.16b, v22.16b
+ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[0]
+
+ ldr q30, [x2, w11, sxtw]
+ ldr q28, [x4, w11, sxtw]
+
+// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5*
+ SQSHRUNN v2, v2, v3, v4, v5, 15
+
+// j1 >>= 15; // j1 = v3, -v6*, -v7*, -v8*, -v9*
+ SQSHRUNN v3, v6, v7, v8, v9, 15
+
+// {
+// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
+// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
+ uabd v30.16b, v22.16b, v30.16b
+ uabd v31.16b, v24.16b, v31.16b
+ uabd v28.16b, v22.16b, v28.16b
+ uabd v29.16b, v24.16b, v29.16b
+ uhadd v31.16b, v31.16b, v30.16b
+ uhadd v29.16b, v29.16b, v28.16b
+
+ ldr q27, [x2, w13, sxtw]
+ ldr q26, [x4, w13, sxtw]
+
+// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
+ ushr v18.16b, v11.16b, #1
+ umax v18.16b, v18.16b, v31.16b
+ umax v18.16b, v18.16b, v29.16b
+// } // v28, v30 preserved for next block
+// { // tdiff2 = v14
+// int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1;
+// int t2 =(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1;
+ uabd v31.16b, v21.16b, v27.16b
+ uabd v29.16b, v21.16b, v26.16b
+ uhadd v31.16b, v31.16b, v30.16b
+ uhadd v29.16b, v29.16b, v28.16b
+
+// diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v19
+ ushr v19.16b, v14.16b, #1
+ umax v19.16b, v19.16b, v31.16b
+ umax v19.16b, v19.16b, v29.16b
+// }
+
+ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
+ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28
+
+ // diff2 = v19, d0 = v10, p1 = v22, d2 = v15, p3 = v21, (p4 >> 1) = v12
+ SPAT_CHECK v19, v10, v22, v15, v21, v12, v31, v30, v29, v28
+
+ // j1 = v3, j2 = v16, p1 = v22, d2 = v15, p3 = v21, td2 = v14, diff2 = v19
+ INTERPOL v3, v3, v16, v22, v15, v21, v14, v19, v31, v30, v29
+
+// dst[d_stride * 2] = av_clip_uint8(interpol);
+ str q3, [x0, w5, sxtw]
+
+// dst[d_stride] = p1;
+ str q22, [x0, w1, sxtw]
+
+ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18
+ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29
+
+// dst[0] = av_clip_uint8(interpol);
+ str q2, [x0], #16
+// }
+//
+// dst++;
+// cur++;
+// prev++;
+// prev2++;
+// next++;
+// }
+ subs w10, w10, #16
+ add x2, x2, #16
+ add x3, x3, #16
+ add x4, x4, #16
+ add x17, x17, #16
+ bgt 10b
+
+ POP_VREGS
+99:
+ ret
+endfunc
+
// ===========================================================================
//
// void filter_line(
--
2.39.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread