* [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
@ 2025-07-17 10:45 Niklas Haas
2025-07-17 10:45 ` [FFmpeg-devel] [PATCH v2 2/2] tests/checkasm: add test for vf_blackdetect Niklas Haas
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Niklas Haas @ 2025-07-17 10:45 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
Requested by a user. Even with autovectorization enabled, the compiler
performs a quite poor job of optimizing this function, due to not being
able to take advantage of the pmaxub + pcmpeqb trick for counting the number
of pixels less than or equal-to a threshold.
blackdetect8_c: 4625.0 ( 1.00x)
blackdetect8_avx2: 155.1 (29.83x)
blackdetect16_c: 2529.4 ( 1.00x)
blackdetect16_avx2: 163.6 (15.46x)
---
libavfilter/vf_blackdetect.c | 33 +++---------
libavfilter/vf_blackdetect.h | 71 ++++++++++++++++++++++++++
libavfilter/x86/Makefile | 2 +
libavfilter/x86/vf_blackdetect.asm | 73 +++++++++++++++++++++++++++
libavfilter/x86/vf_blackdetect_init.c | 34 +++++++++++++
5 files changed, 188 insertions(+), 25 deletions(-)
create mode 100644 libavfilter/vf_blackdetect.h
create mode 100644 libavfilter/x86/vf_blackdetect.asm
create mode 100644 libavfilter/x86/vf_blackdetect_init.c
diff --git a/libavfilter/vf_blackdetect.c b/libavfilter/vf_blackdetect.c
index 8be33a814d..b233bdfd60 100644
--- a/libavfilter/vf_blackdetect.c
+++ b/libavfilter/vf_blackdetect.c
@@ -33,6 +33,7 @@
#include "filters.h"
#include "formats.h"
#include "video.h"
+#include "vf_blackdetect.h"
typedef struct BlackDetectContext {
const AVClass *class;
@@ -53,6 +54,8 @@ typedef struct BlackDetectContext {
int depth;
int nb_threads;
unsigned int *counter;
+
+ ff_blackdetect_fn func;
} BlackDetectContext;
#define OFFSET(x) offsetof(BlackDetectContext, x)
@@ -133,6 +136,7 @@ static int config_input(AVFilterLink *inlink)
s->time_base = inlink->time_base;
s->black_min_duration = s->black_min_duration_time / av_q2d(s->time_base);
s->counter = av_calloc(s->nb_threads, sizeof(*s->counter));
+ s->func = ff_blackdetect_get_fn(depth);
if (!s->counter)
return AVERROR(ENOMEM);
@@ -160,37 +164,16 @@ static int black_counter(AVFilterContext *ctx, void *arg,
int jobnr, int nb_jobs)
{
BlackDetectContext *s = ctx->priv;
- const unsigned int threshold = s->pixel_black_th_i;
- unsigned int *counterp = &s->counter[jobnr];
- AVFrame *in = arg;
+ const AVFrame *in = arg;
const int plane = s->alpha ? 3 : 0;
const int linesize = in->linesize[plane];
- const int w = in->width;
const int h = in->height;
const int start = (h * jobnr) / nb_jobs;
const int end = (h * (jobnr+1)) / nb_jobs;
- const int size = end - start;
- unsigned int counter = 0;
-
- if (s->depth == 8) {
- const uint8_t *p = in->data[plane] + start * linesize;
-
- for (int i = 0; i < size; i++) {
- for (int x = 0; x < w; x++)
- counter += p[x] <= threshold;
- p += linesize;
- }
- } else {
- const uint16_t *p = (const uint16_t *)(in->data[plane] + start * linesize);
-
- for (int i = 0; i < size; i++) {
- for (int x = 0; x < w; x++)
- counter += p[x] <= threshold;
- p += linesize / 2;
- }
- }
- *counterp = counter;
+ s->counter[jobnr] = s->func(in->data[plane] + start * linesize,
+ linesize, in->width, end - start,
+ s->pixel_black_th_i);
return 0;
}
diff --git a/libavfilter/vf_blackdetect.h b/libavfilter/vf_blackdetect.h
new file mode 100644
index 0000000000..361da2c5bc
--- /dev/null
+++ b/libavfilter/vf_blackdetect.h
@@ -0,0 +1,71 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_VF_BLACKDETECT_H
+#define AVFILTER_VF_BLACKDETECT_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef unsigned (*ff_blackdetect_fn)(const uint8_t *src, ptrdiff_t stride,
+ ptrdiff_t width, ptrdiff_t height,
+ unsigned threshold);
+
+ff_blackdetect_fn ff_blackdetect_get_fn_x86(int depth);
+
+static unsigned count_pixels8_c(const uint8_t *src, ptrdiff_t stride,
+ ptrdiff_t width, ptrdiff_t height,
+ unsigned threshold)
+{
+ unsigned int counter = 0;
+ while (height--) {
+ for (int x = 0; x < width; x++)
+ counter += src[x] <= threshold;
+ src += stride;
+ }
+ return counter;
+}
+
+static unsigned count_pixels16_c(const uint8_t *src, ptrdiff_t stride,
+ ptrdiff_t width, ptrdiff_t height,
+ unsigned threshold)
+{
+ unsigned int counter = 0;
+ while (height--) {
+ const uint16_t *src16 = (const uint16_t *) src;
+ for (int x = 0; x < width; x++)
+ counter += src16[x] <= threshold;
+ src += stride;
+ }
+ return counter;
+}
+
+
+static inline ff_blackdetect_fn ff_blackdetect_get_fn(int depth)
+{
+ ff_blackdetect_fn fn = NULL;
+#if ARCH_X86
+ fn = ff_blackdetect_get_fn_x86(depth);
+#endif
+
+ if (!fn)
+ fn = depth == 8 ? count_pixels8_c : count_pixels16_c;
+ return fn;
+}
+
+#endif /* AVFILTER_VF_BLACKDETECT_H */
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 0efe3f8d2c..86f7119a7b 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -3,6 +3,7 @@ OBJS-$(CONFIG_SCENE_SAD) += x86/scene_sad_init.o
OBJS-$(CONFIG_AFIR_FILTER) += x86/af_afir_init.o
OBJS-$(CONFIG_ANLMDN_FILTER) += x86/af_anlmdn_init.o
OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise_init.o
+OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect_init.o
OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
@@ -49,6 +50,7 @@ X86ASM-OBJS-$(CONFIG_SCENE_SAD) += x86/scene_sad.o
X86ASM-OBJS-$(CONFIG_AFIR_FILTER) += x86/af_afir.o
X86ASM-OBJS-$(CONFIG_ANLMDN_FILTER) += x86/af_anlmdn.o
X86ASM-OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise.o
+X86ASM-OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect.o
X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
diff --git a/libavfilter/x86/vf_blackdetect.asm b/libavfilter/x86/vf_blackdetect.asm
new file mode 100644
index 0000000000..78c24c5adc
--- /dev/null
+++ b/libavfilter/x86/vf_blackdetect.asm
@@ -0,0 +1,73 @@
+;*****************************************************************************
+;* x86-optimized functions for blackdetect filter
+;*
+;* Copyright (C) 2025 Niklas Haas
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro count_pixels_fn 1 ; depth
+cglobal blackdetect_%1, 5, 7, 2, src, stride, width, height, threshold
+ movd xm1, thresholdd
+ %if %1 == 8
+ vpbroadcastb m1, xm1
+ %else
+ vpbroadcastw m1, xm1
+ shl widthq, 1
+ %endif
+ add srcq, widthq
+ neg widthq
+ xor r4, r4
+ mov r5, widthq
+ jmp .start
+.loop:
+ popcnt r6d, r6d
+ add r4, r6
+.start:
+ movu m0, [srcq + r5]
+ %if %1 == 8
+ pmaxub m0, m1
+ pcmpeqb m0, m1
+ %else
+ pmaxuw m0, m1
+ pcmpeqw m0, m1
+ %endif
+ pmovmskb r6d, m0
+ add r5, mmsize
+ jl .loop
+ ; handle tail by shifting away unused high elements
+ shlx r6d, r6d, r5d
+ popcnt r6d, r6d
+ add r4, r6
+ add srcq, strideq
+ mov r5, widthq
+ dec heightq
+ jg .start
+ %if %1 > 8
+ shr r4, 1
+ %endif
+ mov rax, r4
+ RET
+%endmacro
+
+INIT_YMM avx2
+count_pixels_fn 8
+count_pixels_fn 16
diff --git a/libavfilter/x86/vf_blackdetect_init.c b/libavfilter/x86/vf_blackdetect_init.c
new file mode 100644
index 0000000000..3780072589
--- /dev/null
+++ b/libavfilter/x86/vf_blackdetect_init.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_blackdetect.h"
+
+unsigned ff_blackdetect_8_avx2(const uint8_t *, ptrdiff_t, ptrdiff_t, ptrdiff_t, unsigned);
+unsigned ff_blackdetect_16_avx2(const uint8_t *, ptrdiff_t, ptrdiff_t, ptrdiff_t, unsigned);
+
+av_cold ff_blackdetect_fn ff_blackdetect_get_fn_x86(int depth)
+{
+ int cpu_flags = av_get_cpu_flags();
+ if (EXTERNAL_AVX2(cpu_flags))
+ return depth == 8 ? ff_blackdetect_8_avx2 : ff_blackdetect_16_avx2;
+ return NULL;
+}
--
2.50.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* [FFmpeg-devel] [PATCH v2 2/2] tests/checkasm: add test for vf_blackdetect
2025-07-17 10:45 [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version Niklas Haas
@ 2025-07-17 10:45 ` Niklas Haas
2025-07-18 11:35 ` [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version Zhao Zhili
2025-07-18 11:36 ` Kacper Michajlow
2 siblings, 0 replies; 13+ messages in thread
From: Niklas Haas @ 2025-07-17 10:45 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
---
tests/checkasm/Makefile | 1 +
tests/checkasm/checkasm.c | 3 ++
tests/checkasm/checkasm.h | 1 +
tests/checkasm/vf_blackdetect.c | 69 +++++++++++++++++++++++++++++++++
tests/fate/checkasm.mak | 1 +
5 files changed, 75 insertions(+)
create mode 100644 tests/checkasm/vf_blackdetect.c
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 1938468bbd..c6d5b0ba1f 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -54,6 +54,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes)
# libavfilter tests
AVFILTEROBJS-$(CONFIG_SCENE_SAD) += scene_sad.o
AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
+AVFILTEROBJS-$(CONFIG_BLACKDETECT_FILTER) += vf_blackdetect.o
AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o
AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o
AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 66a8f8ff86..2532405f29 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -266,6 +266,9 @@ static const struct {
#if CONFIG_AFIR_FILTER
{ "af_afir", checkasm_check_afir },
#endif
+ #if CONFIG_BLACKDETECT_FILTER
+ { "vf_blackdetect", checkasm_check_blackdetect },
+ #endif
#if CONFIG_BLEND_FILTER
{ "vf_blend", checkasm_check_blend },
#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 825e7ef52f..d85bbaf7fa 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -86,6 +86,7 @@ void checkasm_check_alacdsp(void);
void checkasm_check_apv_dsp(void);
void checkasm_check_audiodsp(void);
void checkasm_check_av_tx(void);
+void checkasm_check_blackdetect(void);
void checkasm_check_blend(void);
void checkasm_check_blockdsp(void);
void checkasm_check_bswapdsp(void);
diff --git a/tests/checkasm/vf_blackdetect.c b/tests/checkasm/vf_blackdetect.c
new file mode 100644
index 0000000000..30e59740d7
--- /dev/null
+++ b/tests/checkasm/vf_blackdetect.c
@@ -0,0 +1,69 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+
+#include "libavfilter/vf_blackdetect.h"
+#include "libavutil/mem_internal.h"
+
+#define WIDTH 256
+#define HEIGHT 16
+#define STRIDE (WIDTH + 32)
+
+static void check_blackdetect(int depth)
+{
+ LOCAL_ALIGNED_32(uint8_t, in, [HEIGHT * STRIDE]);
+
+ declare_func(unsigned, const uint8_t *in, ptrdiff_t stride,
+ ptrdiff_t width, ptrdiff_t height,
+ unsigned threshold);
+
+ memset(in, 0, HEIGHT * STRIDE);
+ for (int y = 0; y < HEIGHT; y++) {
+ for (int x = 0; x < WIDTH; x++)
+ in[y * STRIDE + x] = rnd() & 0xFF;
+ }
+
+ const unsigned threshold = 16 << (depth - 8);
+
+ int w = WIDTH;
+ if (depth == 16)
+ w /= 2;
+
+ if (check_func(ff_blackdetect_get_fn(depth), "blackdetect%d", depth)) {
+ /* Ensure odd tail is handled correctly */
+ unsigned count_ref = call_ref(in, STRIDE, w - 8, HEIGHT, threshold);
+ unsigned count_new = call_new(in, STRIDE, w - 8, HEIGHT, threshold);
+ if (count_ref != count_new) {
+ fprintf(stderr, "blackdetect%d: count mismatch: %u != %u\n",
+ depth, count_ref, count_new);
+ fail();
+ }
+ bench_new(in, STRIDE, w, HEIGHT, 16);
+ }
+}
+
+void checkasm_check_blackdetect(void)
+{
+ check_blackdetect(8);
+ report("blackdetect8");
+
+ check_blackdetect(16);
+ report("blackdetect16");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index 2dd46f1001..0ae402cad4 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -56,6 +56,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \
fate-checkasm-v210dec \
fate-checkasm-v210enc \
fate-checkasm-vc1dsp \
+ fate-checkasm-vf_blackdetect \
fate-checkasm-vf_blend \
fate-checkasm-vf_bwdif \
fate-checkasm-vf_colorspace \
--
2.50.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
2025-07-17 10:45 [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version Niklas Haas
2025-07-17 10:45 ` [FFmpeg-devel] [PATCH v2 2/2] tests/checkasm: add test for vf_blackdetect Niklas Haas
@ 2025-07-18 11:35 ` Zhao Zhili
2025-07-18 11:36 ` Kacper Michajlow
2 siblings, 0 replies; 13+ messages in thread
From: Zhao Zhili @ 2025-07-18 11:35 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Niklas Haas
> On Jul 17, 2025, at 18:45, Niklas Haas <ffmpeg@haasn.xyz> wrote:
>
> From: Niklas Haas <git@haasn.dev>
>
> Requested by a user. Even with autovectorization enabled, the compiler
> performs a quite poor job of optimizing this function, due to not being
> able to take advantage of the pmaxub + pcmpeqb trick for counting the number
> of pixels less than or equal-to a threshold.
>
> blackdetect8_c: 4625.0 ( 1.00x)
> blackdetect8_avx2: 155.1 (29.83x)
> blackdetect16_c: 2529.4 ( 1.00x)
> blackdetect16_avx2: 163.6 (15.46x)
> ---
> libavfilter/vf_blackdetect.c | 33 +++---------
> libavfilter/vf_blackdetect.h | 71 ++++++++++++++++++++++++++
> libavfilter/x86/Makefile | 2 +
> libavfilter/x86/vf_blackdetect.asm | 73 +++++++++++++++++++++++++++
> libavfilter/x86/vf_blackdetect_init.c | 34 +++++++++++++
> 5 files changed, 188 insertions(+), 25 deletions(-)
> create mode 100644 libavfilter/vf_blackdetect.h
> create mode 100644 libavfilter/x86/vf_blackdetect.asm
> create mode 100644 libavfilter/x86/vf_blackdetect_init.c
>
> diff --git a/libavfilter/vf_blackdetect.c b/libavfilter/vf_blackdetect.c
> index 8be33a814d..b233bdfd60 100644
> --- a/libavfilter/vf_blackdetect.c
> +++ b/libavfilter/vf_blackdetect.c
> @@ -33,6 +33,7 @@
> #include "filters.h"
> #include "formats.h"
> #include "video.h"
> +#include "vf_blackdetect.h"
>
> typedef struct BlackDetectContext {
> const AVClass *class;
> @@ -53,6 +54,8 @@ typedef struct BlackDetectContext {
> int depth;
> int nb_threads;
> unsigned int *counter;
> +
> + ff_blackdetect_fn func;
> } BlackDetectContext;
>
> #define OFFSET(x) offsetof(BlackDetectContext, x)
> @@ -133,6 +136,7 @@ static int config_input(AVFilterLink *inlink)
> s->time_base = inlink->time_base;
> s->black_min_duration = s->black_min_duration_time / av_q2d(s->time_base);
> s->counter = av_calloc(s->nb_threads, sizeof(*s->counter));
> + s->func = ff_blackdetect_get_fn(depth);
> if (!s->counter)
> return AVERROR(ENOMEM);
>
> @@ -160,37 +164,16 @@ static int black_counter(AVFilterContext *ctx, void *arg,
> int jobnr, int nb_jobs)
> {
> BlackDetectContext *s = ctx->priv;
> - const unsigned int threshold = s->pixel_black_th_i;
> - unsigned int *counterp = &s->counter[jobnr];
> - AVFrame *in = arg;
> + const AVFrame *in = arg;
> const int plane = s->alpha ? 3 : 0;
> const int linesize = in->linesize[plane];
> - const int w = in->width;
> const int h = in->height;
> const int start = (h * jobnr) / nb_jobs;
> const int end = (h * (jobnr+1)) / nb_jobs;
> - const int size = end - start;
> - unsigned int counter = 0;
> -
> - if (s->depth == 8) {
> - const uint8_t *p = in->data[plane] + start * linesize;
> -
> - for (int i = 0; i < size; i++) {
> - for (int x = 0; x < w; x++)
> - counter += p[x] <= threshold;
> - p += linesize;
> - }
> - } else {
> - const uint16_t *p = (const uint16_t *)(in->data[plane] + start * linesize);
> -
> - for (int i = 0; i < size; i++) {
> - for (int x = 0; x < w; x++)
> - counter += p[x] <= threshold;
> - p += linesize / 2;
> - }
> - }
>
> - *counterp = counter;
> + s->counter[jobnr] = s->func(in->data[plane] + start * linesize,
> + linesize, in->width, end - start,
> + s->pixel_black_th_i);
>
> return 0;
> }
> diff --git a/libavfilter/vf_blackdetect.h b/libavfilter/vf_blackdetect.h
> new file mode 100644
> index 0000000000..361da2c5bc
> --- /dev/null
> +++ b/libavfilter/vf_blackdetect.h
> @@ -0,0 +1,71 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVFILTER_VF_BLACKDETECT_H
> +#define AVFILTER_VF_BLACKDETECT_H
This break fate-source.
https://ffmpeg.org/pipermail/ffmpeg-devel/2025-July/346792.html
> +
> +#include <stddef.h>
> +#include <stdint.h>
> +
> +typedef unsigned (*ff_blackdetect_fn)(const uint8_t *src, ptrdiff_t stride,
> + ptrdiff_t width, ptrdiff_t height,
> + unsigned threshold);
> +
> +ff_blackdetect_fn ff_blackdetect_get_fn_x86(int depth);
> +
> +static unsigned count_pixels8_c(const uint8_t *src, ptrdiff_t stride,
> + ptrdiff_t width, ptrdiff_t height,
> + unsigned threshold)
> +{
> + unsigned int counter = 0;
> + while (height--) {
> + for (int x = 0; x < width; x++)
> + counter += src[x] <= threshold;
> + src += stride;
> + }
> + return counter;
> +}
> +
> +static unsigned count_pixels16_c(const uint8_t *src, ptrdiff_t stride,
> + ptrdiff_t width, ptrdiff_t height,
> + unsigned threshold)
> +{
> + unsigned int counter = 0;
> + while (height--) {
> + const uint16_t *src16 = (const uint16_t *) src;
> + for (int x = 0; x < width; x++)
> + counter += src16[x] <= threshold;
> + src += stride;
> + }
> + return counter;
> +}
> +
> +
> +static inline ff_blackdetect_fn ff_blackdetect_get_fn(int depth)
> +{
> + ff_blackdetect_fn fn = NULL;
> +#if ARCH_X86
> + fn = ff_blackdetect_get_fn_x86(depth);
> +#endif
> +
> + if (!fn)
> + fn = depth == 8 ? count_pixels8_c : count_pixels16_c;
> + return fn;
> +}
> +
> +#endif /* AVFILTER_VF_BLACKDETECT_H */
> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
> index 0efe3f8d2c..86f7119a7b 100644
> --- a/libavfilter/x86/Makefile
> +++ b/libavfilter/x86/Makefile
> @@ -3,6 +3,7 @@ OBJS-$(CONFIG_SCENE_SAD) += x86/scene_sad_init.o
> OBJS-$(CONFIG_AFIR_FILTER) += x86/af_afir_init.o
> OBJS-$(CONFIG_ANLMDN_FILTER) += x86/af_anlmdn_init.o
> OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise_init.o
> +OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect_init.o
> OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
> OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
> OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
> @@ -49,6 +50,7 @@ X86ASM-OBJS-$(CONFIG_SCENE_SAD) += x86/scene_sad.o
> X86ASM-OBJS-$(CONFIG_AFIR_FILTER) += x86/af_afir.o
> X86ASM-OBJS-$(CONFIG_ANLMDN_FILTER) += x86/af_anlmdn.o
> X86ASM-OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise.o
> +X86ASM-OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect.o
> X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
> X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
> X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
> diff --git a/libavfilter/x86/vf_blackdetect.asm b/libavfilter/x86/vf_blackdetect.asm
> new file mode 100644
> index 0000000000..78c24c5adc
> --- /dev/null
> +++ b/libavfilter/x86/vf_blackdetect.asm
> @@ -0,0 +1,73 @@
> +;*****************************************************************************
> +;* x86-optimized functions for blackdetect filter
> +;*
> +;* Copyright (C) 2025 Niklas Haas
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;*****************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION .text
> +
> +%macro count_pixels_fn 1 ; depth
> +cglobal blackdetect_%1, 5, 7, 2, src, stride, width, height, threshold
> + movd xm1, thresholdd
> + %if %1 == 8
> + vpbroadcastb m1, xm1
> + %else
> + vpbroadcastw m1, xm1
> + shl widthq, 1
> + %endif
> + add srcq, widthq
> + neg widthq
> + xor r4, r4
> + mov r5, widthq
> + jmp .start
> +.loop:
> + popcnt r6d, r6d
> + add r4, r6
> +.start:
> + movu m0, [srcq + r5]
> + %if %1 == 8
> + pmaxub m0, m1
> + pcmpeqb m0, m1
> + %else
> + pmaxuw m0, m1
> + pcmpeqw m0, m1
> + %endif
> + pmovmskb r6d, m0
> + add r5, mmsize
> + jl .loop
> + ; handle tail by shifting away unused high elements
> + shlx r6d, r6d, r5d
> + popcnt r6d, r6d
> + add r4, r6
> + add srcq, strideq
> + mov r5, widthq
> + dec heightq
> + jg .start
> + %if %1 > 8
> + shr r4, 1
> + %endif
> + mov rax, r4
> + RET
> +%endmacro
> +
> +INIT_YMM avx2
> +count_pixels_fn 8
> +count_pixels_fn 16
> diff --git a/libavfilter/x86/vf_blackdetect_init.c b/libavfilter/x86/vf_blackdetect_init.c
> new file mode 100644
> index 0000000000..3780072589
> --- /dev/null
> +++ b/libavfilter/x86/vf_blackdetect_init.c
> @@ -0,0 +1,34 @@
> +/*
> + * Copyright (c) 2025 Niklas Haas
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/vf_blackdetect.h"
> +
> +unsigned ff_blackdetect_8_avx2(const uint8_t *, ptrdiff_t, ptrdiff_t, ptrdiff_t, unsigned);
> +unsigned ff_blackdetect_16_avx2(const uint8_t *, ptrdiff_t, ptrdiff_t, ptrdiff_t, unsigned);
> +
> +av_cold ff_blackdetect_fn ff_blackdetect_get_fn_x86(int depth)
> +{
> + int cpu_flags = av_get_cpu_flags();
> + if (EXTERNAL_AVX2(cpu_flags))
> + return depth == 8 ? ff_blackdetect_8_avx2 : ff_blackdetect_16_avx2;
> + return NULL;
> +}
> --
> 2.50.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
2025-07-17 10:45 [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version Niklas Haas
2025-07-17 10:45 ` [FFmpeg-devel] [PATCH v2 2/2] tests/checkasm: add test for vf_blackdetect Niklas Haas
2025-07-18 11:35 ` [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version Zhao Zhili
@ 2025-07-18 11:36 ` Kacper Michajlow
2025-07-18 12:14 ` Kieran Kunhya via ffmpeg-devel
2025-07-18 12:26 ` Zhao Zhili
2 siblings, 2 replies; 13+ messages in thread
From: Kacper Michajlow @ 2025-07-18 11:36 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Thu, 17 Jul 2025 at 12:45, Niklas Haas <ffmpeg@haasn.xyz> wrote:
>
> From: Niklas Haas <git@haasn.dev>
>
> Requested by a user. Even with autovectorization enabled, the compiler
> performs a quite poor job of optimizing this function, due to not being
> able to take advantage of the pmaxub + pcmpeqb trick for counting the number
> of pixels less than or equal-to a threshold.
>
> blackdetect8_c: 4625.0 ( 1.00x)
> blackdetect8_avx2: 155.1 (29.83x)
> blackdetect16_c: 2529.4 ( 1.00x)
> blackdetect16_avx2: 163.6 (15.46x)
I think we should try to have better standards for reporting
performance metrics. Those numbers without context mean not so much.
What compiler, flags, cpu were used? Sure, we can omit some
information if we want to show only the scaling, but if it highly
depends on those things, then we should at least try to be more
specific.
Sorry for being pedantic about those things, but I think it's
important, especially if we put those values in a commit message which
will live forever in the repository as a vague reference.
> Even with autovectorization enabled
You mention the auto vectorization enabled, yet the reported numbers
are without it. In my mind this description implies that shown
performance comparison is with auto vectorization enabled.
When we compare apples to apples, with avx2 we get a more expectable
3.74x (gcc) / 2.38x (clang) depending on the compiler. It's still a
good improvement, no reason to oversell it.
For reference some metrics on me end:
clang 20.1.7
march=generic (default config):
blackdetect8_c: 1591.1 ( 1.00x)
blackdetect8_avx2: 225.1 ( 7.07x)
blackdetect16_c: 643.5 ( 1.00x)
blackdetect16_avx2: 220.6 ( 2.92x)
march=core-avx2:
blackdetect8_c: 526.0 ( 1.00x)
blackdetect8_avx2: 220.9 ( 2.38x)
blackdetect16_c: 318.8 ( 1.00x)
blackdetect16_avx2: 225.9 ( 1.41x)
gcc 14.2.0
-fno-tree-vectorize (default config):
blackdetect8_c: 5126.6 ( 1.00x)
blackdetect8_avx2: 198.0 (25.89x)
blackdetect16_c: 2151.9 ( 1.00x)
blackdetect16_avx2: 196.8 (10.93x)
march=generic -ftree-vectorize:
blackdetect8_c: 1354.4 ( 1.00x)
blackdetect8_avx2: 196.9 ( 6.88x)
blackdetect16_c: 644.2 ( 1.00x)
blackdetect16_avx2: 249.8 ( 2.58x)
march=core-avx2 -ftree-vectorize:
blackdetect8_c: 820.8 ( 1.00x)
blackdetect8_avx2: 219.2 ( 3.74x)
blackdetect16_c: 372.8 ( 1.00x)
blackdetect16_avx2: 201.4 ( 1.85x)
Again, sorry for being pedantic here, but it gives the wrong
impression especially if you look at this from outside.
- Kacper
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
2025-07-18 11:36 ` Kacper Michajlow
@ 2025-07-18 12:14 ` Kieran Kunhya via ffmpeg-devel
2025-07-18 12:28 ` Kacper Michajlow
2025-07-18 12:41 ` Kacper Michajlow
2025-07-18 12:26 ` Zhao Zhili
1 sibling, 2 replies; 13+ messages in thread
From: Kieran Kunhya via ffmpeg-devel @ 2025-07-18 12:14 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Kieran Kunhya
> blackdetect8_c: 820.8 ( 1.00x)
> blackdetect8_avx2: 219.2 ( 3.74x)
> blackdetect16_c: 372.8 ( 1.00x)
> blackdetect16_avx2: 201.4 ( 1.85x)
>
> Again, sorry for being pedantic here, but it gives the wrong
> impression especially if you look at this from outside.
Also misleading as far as I understand because GCC doesn't have
runtime detection like FFmpeg.
Kieran
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
2025-07-18 11:36 ` Kacper Michajlow
2025-07-18 12:14 ` Kieran Kunhya via ffmpeg-devel
@ 2025-07-18 12:26 ` Zhao Zhili
1 sibling, 0 replies; 13+ messages in thread
From: Zhao Zhili @ 2025-07-18 12:26 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> On Jul 18, 2025, at 19:36, Kacper Michajlow <kasper93@gmail.com> wrote:
>
> On Thu, 17 Jul 2025 at 12:45, Niklas Haas <ffmpeg@haasn.xyz> wrote:
>>
>> From: Niklas Haas <git@haasn.dev>
>>
>> Requested by a user. Even with autovectorization enabled, the compiler
>> performs a quite poor job of optimizing this function, due to not being
>> able to take advantage of the pmaxub + pcmpeqb trick for counting the number
>> of pixels less than or equal-to a threshold.
>>
>> blackdetect8_c: 4625.0 ( 1.00x)
>> blackdetect8_avx2: 155.1 (29.83x)
>> blackdetect16_c: 2529.4 ( 1.00x)
>> blackdetect16_avx2: 163.6 (15.46x)
>
> I think we should try to have better standards for reporting
> performance metrics. Those numbers without context mean not so much.
> What compiler, flags, cpu were used? Sure, we can omit some
> information if we want to show only the scaling, but if it highly
> depends on those things, then we should at least try to be more
> specific.
>
> Sorry for being pedantic about those things, but I think it's
> important, especially if we put those values in a commit message which
> will live forever in the repository as a vague reference.
The basic benchmark context can be specified in doc and leave out in commits.
And it’s easy to get the implicit conditions in the mailing list. I’m more worried about
only give the data on social media platforms, e.g., X.
The checkasm benchmark serve our purpose very well, but it’s not a fair benchmark to
compare hand written assembly to compiler optimizations. Sure we can do better than
compiler, but compiler isn’t that sucks. More importantly, compiler isn’t our enemy,
no point in embarrassing the compiler or compiler developer.
"If I have seen further it is by standing on the shoulders of Giants.”
— By Isaac Newton.
Runtime cpu detection is a thing. There are methods to use compiler auto-vectorize and
runtime cpu detection at the same time. Our approach isn’t the only working approach.
>
>> Even with autovectorization enabled
>
> You mention the auto vectorization enabled, yet the reported numbers
> are without it. In my mind this description implies that shown
> performance comparison is with auto vectorization enabled.
>
> When we compare apples to apples, with avx2 we get a more expectable
> 3.74x (gcc) / 2.38x (clang) depending on the compiler. It's still a
> good improvement, no reason to oversell it.
>
> For reference some metrics on me end:
>
> clang 20.1.7
>
> march=generic (default config):
>
> blackdetect8_c: 1591.1 ( 1.00x)
> blackdetect8_avx2: 225.1 ( 7.07x)
> blackdetect16_c: 643.5 ( 1.00x)
> blackdetect16_avx2: 220.6 ( 2.92x)
>
> march=core-avx2:
>
> blackdetect8_c: 526.0 ( 1.00x)
> blackdetect8_avx2: 220.9 ( 2.38x)
> blackdetect16_c: 318.8 ( 1.00x)
> blackdetect16_avx2: 225.9 ( 1.41x)
>
> gcc 14.2.0
>
> -fno-tree-vectorize (default config):
>
> blackdetect8_c: 5126.6 ( 1.00x)
> blackdetect8_avx2: 198.0 (25.89x)
> blackdetect16_c: 2151.9 ( 1.00x)
> blackdetect16_avx2: 196.8 (10.93x)
>
> march=generic -ftree-vectorize:
>
> blackdetect8_c: 1354.4 ( 1.00x)
> blackdetect8_avx2: 196.9 ( 6.88x)
> blackdetect16_c: 644.2 ( 1.00x)
> blackdetect16_avx2: 249.8 ( 2.58x)
>
> march=core-avx2 -ftree-vectorize:
>
> blackdetect8_c: 820.8 ( 1.00x)
> blackdetect8_avx2: 219.2 ( 3.74x)
> blackdetect16_c: 372.8 ( 1.00x)
> blackdetect16_avx2: 201.4 ( 1.85x)
>
> Again, sorry for being pedantic here, but it gives the wrong
> impression especially if you look at this from outside.
>
> - Kacper
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
2025-07-18 12:14 ` Kieran Kunhya via ffmpeg-devel
@ 2025-07-18 12:28 ` Kacper Michajlow
2025-07-18 12:41 ` Kacper Michajlow
1 sibling, 0 replies; 13+ messages in thread
From: Kacper Michajlow @ 2025-07-18 12:28 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Fri, 18 Jul 2025 at 14:14, Kieran Kunhya via ffmpeg-devel
<ffmpeg-devel@ffmpeg.org> wrote:
>
> > blackdetect8_c: 820.8 ( 1.00x)
> > blackdetect8_avx2: 219.2 ( 3.74x)
> > blackdetect16_c: 372.8 ( 1.00x)
> > blackdetect16_avx2: 201.4 ( 1.85x)
> >
> > Again, sorry for being pedantic here, but it gives the wrong
> > impression especially if you look at this from outside.
>
> Also misleading as far as I understand because GCC doesn't have
> runtime detection like FFmpeg.
It's not misleading if you say in the commit description (or anywhere)
what you are comparing against. Nice of you to conveniently strip this
part from my email.
> march=core-avx2 -ftree-vectorize:
I know it's good publicity to say something is 100x faster. Though,
I'm looking at this from a technical point of view, not the Twitter
point of view.
- Kacper
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
2025-07-18 12:14 ` Kieran Kunhya via ffmpeg-devel
2025-07-18 12:28 ` Kacper Michajlow
@ 2025-07-18 12:41 ` Kacper Michajlow
2025-07-18 12:46 ` Kieran Kunhya via ffmpeg-devel
1 sibling, 1 reply; 13+ messages in thread
From: Kacper Michajlow @ 2025-07-18 12:41 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Fri, 18 Jul 2025 at 14:14, Kieran Kunhya via ffmpeg-devel
<ffmpeg-devel@ffmpeg.org> wrote:
>
> > blackdetect8_c: 820.8 ( 1.00x)
> > blackdetect8_avx2: 219.2 ( 3.74x)
> > blackdetect16_c: 372.8 ( 1.00x)
> > blackdetect16_avx2: 201.4 ( 1.85x)
> >
> > Again, sorry for being pedantic here, but it gives the wrong
> > impression especially if you look at this from outside.
>
> Also misleading as far as I understand because GCC doesn't have
> runtime detection like FFmpeg.
Speak of... actually GCC does have runtime detection. All you have to
do is mark the function with `target_clones` with requested
architectures and it will dispatch automatically during runtime the
best function to use.
See for more information:
https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-target_005fclones-function-attribute
- Kacper
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
2025-07-18 12:41 ` Kacper Michajlow
@ 2025-07-18 12:46 ` Kieran Kunhya via ffmpeg-devel
2025-07-18 13:21 ` Kacper Michajlow
0 siblings, 1 reply; 13+ messages in thread
From: Kieran Kunhya via ffmpeg-devel @ 2025-07-18 12:46 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Kieran Kunhya
On Fri, Jul 18, 2025 at 1:41 PM Kacper Michajlow <kasper93@gmail.com> wrote:
>
> On Fri, 18 Jul 2025 at 14:14, Kieran Kunhya via ffmpeg-devel
> <ffmpeg-devel@ffmpeg.org> wrote:
> >
> > > blackdetect8_c: 820.8 ( 1.00x)
> > > blackdetect8_avx2: 219.2 ( 3.74x)
> > > blackdetect16_c: 372.8 ( 1.00x)
> > > blackdetect16_avx2: 201.4 ( 1.85x)
> > >
> > > Again, sorry for being pedantic here, but it gives the wrong
> > > impression especially if you look at this from outside.
> >
> > Also misleading as far as I understand because GCC doesn't have
> > runtime detection like FFmpeg.
>
> Speak of... actually GCC does have runtime detection. All you have to
> do is mark the function with `target_clones` with requested
> architectures and it will dispatch automatically during runtime the
> best function to use.
>
> See for more information:
> https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-target_005fclones-function-attribute
It's not as sophisticated as our runtime detection (e.g avx512 vs
avx512icl which we support).
Comparing C vs autovectorised code that works only on some platforms
with forced compilation settings is also unfair.
Kieran
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
2025-07-18 12:46 ` Kieran Kunhya via ffmpeg-devel
@ 2025-07-18 13:21 ` Kacper Michajlow
2025-07-18 13:33 ` Kieran Kunhya via ffmpeg-devel
0 siblings, 1 reply; 13+ messages in thread
From: Kacper Michajlow @ 2025-07-18 13:21 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Fri, 18 Jul 2025 at 14:46, Kieran Kunhya via ffmpeg-devel
<ffmpeg-devel@ffmpeg.org> wrote:
>
> On Fri, Jul 18, 2025 at 1:41 PM Kacper Michajlow <kasper93@gmail.com> wrote:
> >
> > On Fri, 18 Jul 2025 at 14:14, Kieran Kunhya via ffmpeg-devel
> > <ffmpeg-devel@ffmpeg.org> wrote:
> > >
> > > > blackdetect8_c: 820.8 ( 1.00x)
> > > > blackdetect8_avx2: 219.2 ( 3.74x)
> > > > blackdetect16_c: 372.8 ( 1.00x)
> > > > blackdetect16_avx2: 201.4 ( 1.85x)
> > > >
> > > > Again, sorry for being pedantic here, but it gives the wrong
> > > > impression especially if you look at this from outside.
> > >
> > > Also misleading as far as I understand because GCC doesn't have
> > > runtime detection like FFmpeg.
> >
> > Speak of... actually GCC does have runtime detection. All you have to
> > do is mark the function with `target_clones` with requested
> > architectures and it will dispatch automatically during runtime the
> > best function to use.
> >
> > See for more information:
> > https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-target_005fclones-function-attribute
>
> It's not as sophisticated as our runtime detection (e.g avx512 vs
> avx512icl which we support).
> Comparing C vs autovectorised code that works only on some platforms
> with forced compilation settings is also unfair.
In my original message clang build was completely default, no forced options.
Handwritten avx512 also works on this specific platform. So comparing
this to autovectorized code (that works on exactly the same platform)
as a baseline makes sense. Furthermore autovectorized code can scale
onto more platforms than handwritten avx512. IMHO comparing things in
the same domain makes more sense.
The point of my message was that we should have defined a baseline
target, if it is GCC without autovectorization, so be it. But it
should be specified and not implied in the commit description that the
compared result is autovectorized.
To be honest, I agree with you. It's misleading and unfair, so we
shouldn't make any comparisons. This is not only limited to
autovectorization, scalar code generation also differs. It just
happens to give the biggest difference.
Context matters, saying "C code performance " is vague. I'm not saying
one way is better than the other, but it doesn't cost anything to
specify it better to avoid miscommunication.
- Kacper
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
2025-07-18 13:21 ` Kacper Michajlow
@ 2025-07-18 13:33 ` Kieran Kunhya via ffmpeg-devel
2025-07-18 14:16 ` Kacper Michajlow
0 siblings, 1 reply; 13+ messages in thread
From: Kieran Kunhya via ffmpeg-devel @ 2025-07-18 13:33 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Kieran Kunhya
On Fri, Jul 18, 2025 at 2:22 PM Kacper Michajlow <kasper93@gmail.com> wrote:
>
> On Fri, 18 Jul 2025 at 14:46, Kieran Kunhya via ffmpeg-devel
> <ffmpeg-devel@ffmpeg.org> wrote:
> >
> > On Fri, Jul 18, 2025 at 1:41 PM Kacper Michajlow <kasper93@gmail.com> wrote:
> > >
> > > On Fri, 18 Jul 2025 at 14:14, Kieran Kunhya via ffmpeg-devel
> > > <ffmpeg-devel@ffmpeg.org> wrote:
> > > >
> > > > > blackdetect8_c: 820.8 ( 1.00x)
> > > > > blackdetect8_avx2: 219.2 ( 3.74x)
> > > > > blackdetect16_c: 372.8 ( 1.00x)
> > > > > blackdetect16_avx2: 201.4 ( 1.85x)
> > > > >
> > > > > Again, sorry for being pedantic here, but it gives the wrong
> > > > > impression especially if you look at this from outside.
> > > >
> > > > Also misleading as far as I understand because GCC doesn't have
> > > > runtime detection like FFmpeg.
> > >
> > > Speak of... actually GCC does have runtime detection. All you have to
> > > do is mark the function with `target_clones` with requested
> > > architectures and it will dispatch automatically during runtime the
> > > best function to use.
> > >
> > > See for more information:
> > > https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-target_005fclones-function-attribute
> >
> > It's not as sophisticated as our runtime detection (e.g avx512 vs
> > avx512icl which we support).
> > Comparing C vs autovectorised code that works only on some platforms
> > with forced compilation settings is also unfair.
>
> In my original message clang build was completely default, no forced options.
>
> Handwritten avx512 also works on this specific platform. So comparing
> this to autovectorized code (that works on exactly the same platform)
> as a baseline makes sense. Furthermore autovectorized code can scale
> onto more platforms than handwritten avx512. IMHO comparing things in
> the same domain makes more sense.
>
> The point of my message was that we should have defined a baseline
> target, if it is GCC without autovectorization, so be it. But it
> should be specified and not implied in the commit description that the
> compared result is autovectorized.
>
> To be honest, I agree with you. It's misleading and unfair, so we
> shouldn't make any comparisons. This is not only limited to
> autovectorization, scalar code generation also differs. It just
> happens to give the biggest difference.
>
> Context matters, saying "C code performance " is vague. I'm not saying
> one way is better than the other, but it doesn't cost anything to
> specify it better to avoid miscommunication.
It's not fair to compare autovectorised output that's AVX512 that will
be called *on any system with AVX512 support including ones that
downclock heavily* with AVX512(ICL) checked properly in FFmpeg to run
on only non-downlocking systems.
Outside the land of the theoretical compiler world, this is a
practical problem. If FFmpeg used compiler runtime detection I
personally would have a significant number of systems downclock
drastically.
I don't believe compilers are smart enough to generate AVX512 with YMM
for that use-case.
It's substantially uglier to use compiler-specific runtime detection.
Compiler autovectorisation is inconsistent across compiler versions.
It's nothing that can be relied upon.
Kieran
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
2025-07-18 13:33 ` Kieran Kunhya via ffmpeg-devel
@ 2025-07-18 14:16 ` Kacper Michajlow
2025-07-18 14:36 ` Kieran Kunhya via ffmpeg-devel
0 siblings, 1 reply; 13+ messages in thread
From: Kacper Michajlow @ 2025-07-18 14:16 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Fri, 18 Jul 2025 at 15:33, Kieran Kunhya via ffmpeg-devel
<ffmpeg-devel@ffmpeg.org> wrote:
>
> On Fri, Jul 18, 2025 at 2:22 PM Kacper Michajlow <kasper93@gmail.com> wrote:
> >
> > On Fri, 18 Jul 2025 at 14:46, Kieran Kunhya via ffmpeg-devel
> > <ffmpeg-devel@ffmpeg.org> wrote:
> > >
> > > On Fri, Jul 18, 2025 at 1:41 PM Kacper Michajlow <kasper93@gmail.com> wrote:
> > > >
> > > > On Fri, 18 Jul 2025 at 14:14, Kieran Kunhya via ffmpeg-devel
> > > > <ffmpeg-devel@ffmpeg.org> wrote:
> > > > >
> > > > > > blackdetect8_c: 820.8 ( 1.00x)
> > > > > > blackdetect8_avx2: 219.2 ( 3.74x)
> > > > > > blackdetect16_c: 372.8 ( 1.00x)
> > > > > > blackdetect16_avx2: 201.4 ( 1.85x)
> > > > > >
> > > > > > Again, sorry for being pedantic here, but it gives the wrong
> > > > > > impression especially if you look at this from outside.
> > > > >
> > > > > Also misleading as far as I understand because GCC doesn't have
> > > > > runtime detection like FFmpeg.
> > > >
> > > > Speak of... actually GCC does have runtime detection. All you have to
> > > > do is mark the function with `target_clones` with requested
> > > > architectures and it will dispatch automatically during runtime the
> > > > best function to use.
> > > >
> > > > See for more information:
> > > > https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-target_005fclones-function-attribute
> > >
> > > It's not as sophisticated as our runtime detection (e.g avx512 vs
> > > avx512icl which we support).
> > > Comparing C vs autovectorised code that works only on some platforms
> > > with forced compilation settings is also unfair.
> >
> > In my original message clang build was completely default, no forced options.
> >
> > Handwritten avx512 also works on this specific platform. So comparing
> > this to autovectorized code (that works on exactly the same platform)
> > as a baseline makes sense. Furthermore autovectorized code can scale
> > onto more platforms than handwritten avx512. IMHO comparing things in
> > the same domain makes more sense.
> >
> > The point of my message was that we should have defined a baseline
> > target, if it is GCC without autovectorization, so be it. But it
> > should be specified and not implied in the commit description that the
> > compared result is autovectorized.
> >
> > To be honest, I agree with you. It's misleading and unfair, so we
> > shouldn't make any comparisons. This is not only limited to
> > autovectorization, scalar code generation also differs. It just
> > happens to give the biggest difference.
> >
> > Context matters, saying "C code performance " is vague. I'm not saying
> > one way is better than the other, but it doesn't cost anything to
> > specify it better to avoid miscommunication.
>
> It's not fair to compare autovectorised output that's AVX512 that will
> be called *on any system with AVX512 support including ones that
> downclock heavily* with AVX512(ICL) checked properly in FFmpeg to run
> on only non-downlocking systems.
That's the customer/user decision how to compile FFmpeg for best
performance on their target platform. Also note, you brought up
avx512, while I agree on the issues with it. I'm commenting on the
AVX2 patch. I wanted to make general comment about the performance
metric we share, diving into avx512 issues is kinda a separate topic.
I guess the C code performance can vary a lot, between compiler,
between optimization flags, between platforms. And we should be
specific about what our "x figure" mean, else it's just a number in
void. There are cases where "fully optimized" generated code is
terrible as with some recent cases, (not this one tho) and then it's
cool to point this out, but if you add different constraints on
compiler generated code it makes this comparison unnecessary
confusing. Whatever that means, but I think you know what I'm trying
to say.
- Kacper
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version
2025-07-18 14:16 ` Kacper Michajlow
@ 2025-07-18 14:36 ` Kieran Kunhya via ffmpeg-devel
0 siblings, 0 replies; 13+ messages in thread
From: Kieran Kunhya via ffmpeg-devel @ 2025-07-18 14:36 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Kieran Kunhya
On Fri, Jul 18, 2025 at 3:17 PM Kacper Michajlow <kasper93@gmail.com> wrote:
>
> On Fri, 18 Jul 2025 at 15:33, Kieran Kunhya via ffmpeg-devel
> <ffmpeg-devel@ffmpeg.org> wrote:
> >
> > On Fri, Jul 18, 2025 at 2:22 PM Kacper Michajlow <kasper93@gmail.com> wrote:
> > >
> > > On Fri, 18 Jul 2025 at 14:46, Kieran Kunhya via ffmpeg-devel
> > > <ffmpeg-devel@ffmpeg.org> wrote:
> > > >
> > > > On Fri, Jul 18, 2025 at 1:41 PM Kacper Michajlow <kasper93@gmail.com> wrote:
> > > > >
> > > > > On Fri, 18 Jul 2025 at 14:14, Kieran Kunhya via ffmpeg-devel
> > > > > <ffmpeg-devel@ffmpeg.org> wrote:
> > > > > >
> > > > > > > blackdetect8_c: 820.8 ( 1.00x)
> > > > > > > blackdetect8_avx2: 219.2 ( 3.74x)
> > > > > > > blackdetect16_c: 372.8 ( 1.00x)
> > > > > > > blackdetect16_avx2: 201.4 ( 1.85x)
> > > > > > >
> > > > > > > Again, sorry for being pedantic here, but it gives the wrong
> > > > > > > impression especially if you look at this from outside.
> > > > > >
> > > > > > Also misleading as far as I understand because GCC doesn't have
> > > > > > runtime detection like FFmpeg.
> > > > >
> > > > > Speak of... actually GCC does have runtime detection. All you have to
> > > > > do is mark the function with `target_clones` with requested
> > > > > architectures and it will dispatch automatically during runtime the
> > > > > best function to use.
> > > > >
> > > > > See for more information:
> > > > > https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-target_005fclones-function-attribute
> > > >
> > > > It's not as sophisticated as our runtime detection (e.g avx512 vs
> > > > avx512icl which we support).
> > > > Comparing C vs autovectorised code that works only on some platforms
> > > > with forced compilation settings is also unfair.
> > >
> > > In my original message clang build was completely default, no forced options.
> > >
> > > Handwritten avx512 also works on this specific platform. So comparing
> > > this to autovectorized code (that works on exactly the same platform)
> > > as a baseline makes sense. Furthermore autovectorized code can scale
> > > onto more platforms than handwritten avx512. IMHO comparing things in
> > > the same domain makes more sense.
> > >
> > > The point of my message was that we should have defined a baseline
> > > target, if it is GCC without autovectorization, so be it. But it
> > > should be specified and not implied in the commit description that the
> > > compared result is autovectorized.
> > >
> > > To be honest, I agree with you. It's misleading and unfair, so we
> > > shouldn't make any comparisons. This is not only limited to
> > > autovectorization, scalar code generation also differs. It just
> > > happens to give the biggest difference.
> > >
> > > Context matters, saying "C code performance " is vague. I'm not saying
> > > one way is better than the other, but it doesn't cost anything to
> > > specify it better to avoid miscommunication.
> >
> > It's not fair to compare autovectorised output that's AVX512 that will
> > be called *on any system with AVX512 support including ones that
> > downclock heavily* with AVX512(ICL) checked properly in FFmpeg to run
> > on only non-downlocking systems.
>
> That's the customer/user decision how to compile FFmpeg for best
> performance on their target platform. Also note, you brought up
> avx512, while I agree on the issues with it. I'm commenting on the
> AVX2 patch. I wanted to make general comment about the performance
> metric we share, diving into avx512 issues is kinda a separate topic.
Huh, we should have the best performance for *all* users (all
compilers, all platforms) by default.
We have this now for SIMD functions, it's an open question about
autovec for the rest.
Kieran
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2025-07-18 14:37 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-07-17 10:45 [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version Niklas Haas
2025-07-17 10:45 ` [FFmpeg-devel] [PATCH v2 2/2] tests/checkasm: add test for vf_blackdetect Niklas Haas
2025-07-18 11:35 ` [FFmpeg-devel] [PATCH v2 1/2] avfilter/vf_blackdetect: add AVX2 SIMD version Zhao Zhili
2025-07-18 11:36 ` Kacper Michajlow
2025-07-18 12:14 ` Kieran Kunhya via ffmpeg-devel
2025-07-18 12:28 ` Kacper Michajlow
2025-07-18 12:41 ` Kacper Michajlow
2025-07-18 12:46 ` Kieran Kunhya via ffmpeg-devel
2025-07-18 13:21 ` Kacper Michajlow
2025-07-18 13:33 ` Kieran Kunhya via ffmpeg-devel
2025-07-18 14:16 ` Kacper Michajlow
2025-07-18 14:36 ` Kieran Kunhya via ffmpeg-devel
2025-07-18 12:26 ` Zhao Zhili
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git