From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.ffmpeg.org (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTPS id 51CBE4C948 for ; Thu, 10 Jul 2025 15:10:54 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.ffmpeg.org (Postfix) with ESMTP id B543F690098; Thu, 10 Jul 2025 18:10:51 +0300 (EEST) Received: from haasn.dev (haasn.dev [78.46.187.166]) by ffbox0-bg.ffmpeg.org (Postfix) with ESMTP id E2195690082 for ; Thu, 10 Jul 2025 18:10:44 +0300 (EEST) Received: from haasn.dev (unknown [10.30.1.1]) by haasn.dev (Postfix) with UTF8SMTP id 3D8BC405F4; Thu, 10 Jul 2025 17:10:44 +0200 (CEST) From: Niklas Haas To: ffmpeg-devel@ffmpeg.org Date: Thu, 10 Jul 2025 17:10:42 +0200 Message-ID: <20250710151042.198670-1-ffmpeg@haasn.xyz> X-Mailer: git-send-email 2.49.0 MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH] avfilter/vf_blackdetect: add AVX2 SIMD version X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Niklas Haas Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: From: Niklas Haas Requested by a user. Even with autovectorization enabled, the compiler does a rather poor job of optimizing this function, because it cannot take advantage of the pmaxub + pcmpeqb trick for counting the number of pixels less than or equal to a threshold. 
blackdetect8_c: 4627.2 ( 1.00x) blackdetect8_avx2: 163.9 (28.24x) blackdetect16_c: 2474.9 ( 1.00x) blackdetect16_avx2: 155.9 (15.88x) --- libavfilter/vf_blackdetect.c | 33 +++--------- libavfilter/vf_blackdetect.h | 71 ++++++++++++++++++++++++ libavfilter/x86/Makefile | 2 + libavfilter/x86/vf_blackdetect.asm | 73 +++++++++++++++++++++++++ libavfilter/x86/vf_blackdetect_init.c | 34 ++++++++++++ tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/vf_blackdetect.c | 78 +++++++++++++++++++++++++++ 9 files changed, 271 insertions(+), 25 deletions(-) create mode 100644 libavfilter/vf_blackdetect.h create mode 100644 libavfilter/x86/vf_blackdetect.asm create mode 100644 libavfilter/x86/vf_blackdetect_init.c create mode 100644 tests/checkasm/vf_blackdetect.c diff --git a/libavfilter/vf_blackdetect.c b/libavfilter/vf_blackdetect.c index 8be33a814d..b233bdfd60 100644 --- a/libavfilter/vf_blackdetect.c +++ b/libavfilter/vf_blackdetect.c @@ -33,6 +33,7 @@ #include "filters.h" #include "formats.h" #include "video.h" +#include "vf_blackdetect.h" typedef struct BlackDetectContext { const AVClass *class; @@ -53,6 +54,8 @@ typedef struct BlackDetectContext { int depth; int nb_threads; unsigned int *counter; + + ff_blackdetect_fn func; } BlackDetectContext; #define OFFSET(x) offsetof(BlackDetectContext, x) @@ -133,6 +136,7 @@ static int config_input(AVFilterLink *inlink) s->time_base = inlink->time_base; s->black_min_duration = s->black_min_duration_time / av_q2d(s->time_base); s->counter = av_calloc(s->nb_threads, sizeof(*s->counter)); + s->func = ff_blackdetect_get_fn(depth); if (!s->counter) return AVERROR(ENOMEM); @@ -160,37 +164,16 @@ static int black_counter(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { BlackDetectContext *s = ctx->priv; - const unsigned int threshold = s->pixel_black_th_i; - unsigned int *counterp = &s->counter[jobnr]; - AVFrame *in = arg; + const AVFrame *in = arg; const int plane = 
s->alpha ? 3 : 0; const int linesize = in->linesize[plane]; - const int w = in->width; const int h = in->height; const int start = (h * jobnr) / nb_jobs; const int end = (h * (jobnr+1)) / nb_jobs; - const int size = end - start; - unsigned int counter = 0; - - if (s->depth == 8) { - const uint8_t *p = in->data[plane] + start * linesize; - - for (int i = 0; i < size; i++) { - for (int x = 0; x < w; x++) - counter += p[x] <= threshold; - p += linesize; - } - } else { - const uint16_t *p = (const uint16_t *)(in->data[plane] + start * linesize); - - for (int i = 0; i < size; i++) { - for (int x = 0; x < w; x++) - counter += p[x] <= threshold; - p += linesize / 2; - } - } - *counterp = counter; + s->counter[jobnr] = s->func(in->data[plane] + start * linesize, + linesize, in->width, end - start, + s->pixel_black_th_i); return 0; } diff --git a/libavfilter/vf_blackdetect.h b/libavfilter/vf_blackdetect.h new file mode 100644 index 0000000000..361da2c5bc --- /dev/null +++ b/libavfilter/vf_blackdetect.h @@ -0,0 +1,71 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_VF_BLACKDETECT_H
+#define AVFILTER_VF_BLACKDETECT_H
+
+#include <stddef.h>
+#include <stdint.h>
+/* Counts samples <= threshold over width x height samples; stride is in bytes. */
+typedef unsigned (*ff_blackdetect_fn)(const uint8_t *src, ptrdiff_t stride,
+                                      ptrdiff_t width, ptrdiff_t height,
+                                      unsigned threshold);
+
+ff_blackdetect_fn ff_blackdetect_get_fn_x86(int depth);
+/* C reference for 8-bit samples (one byte each). */
+static unsigned count_pixels8_c(const uint8_t *src, ptrdiff_t stride,
+                                ptrdiff_t width, ptrdiff_t height,
+                                unsigned threshold)
+{
+    unsigned int counter = 0;
+    while (height--) {
+        for (int x = 0; x < width; x++)
+            counter += src[x] <= threshold;
+        src += stride;
+    }
+    return counter;
+}
+/* C reference for 16-bit samples, read as native-endian uint16_t. */
+static unsigned count_pixels16_c(const uint8_t *src, ptrdiff_t stride,
+                                 ptrdiff_t width, ptrdiff_t height,
+                                 unsigned threshold)
+{
+    unsigned int counter = 0;
+    while (height--) {
+        const uint16_t *src16 = (const uint16_t *) src;
+        for (int x = 0; x < width; x++)
+            counter += src16[x] <= threshold;
+        src += stride; /* stride is a byte count, hence applied to the byte pointer */
+    }
+    return counter;
+}
+
+/* Prefer the x86 version when ARCH_X86 provides one, else the C counter for depth (8, otherwise 16-bit). */
+static inline ff_blackdetect_fn ff_blackdetect_get_fn(int depth)
+{
+    ff_blackdetect_fn fn = NULL;
+#if ARCH_X86
+    fn = ff_blackdetect_get_fn_x86(depth);
+#endif
+
+    if (!fn)
+        fn = depth == 8 ? count_pixels8_c : count_pixels16_c;
+    return fn;
+}
+
+#endif /* AVFILTER_VF_BLACKDETECT_H */
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 0efe3f8d2c..86f7119a7b 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -3,6 +3,7 @@ OBJS-$(CONFIG_SCENE_SAD) += x86/scene_sad_init.o
 OBJS-$(CONFIG_AFIR_FILTER) += x86/af_afir_init.o
 OBJS-$(CONFIG_ANLMDN_FILTER) += x86/af_anlmdn_init.o
 OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise_init.o
+OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect_init.o
 OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
 OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
 OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
@@ -49,6 +50,7 @@ X86ASM-OBJS-$(CONFIG_SCENE_SAD) += x86/scene_sad.o
 X86ASM-OBJS-$(CONFIG_AFIR_FILTER) += x86/af_afir.o
 X86ASM-OBJS-$(CONFIG_ANLMDN_FILTER) += x86/af_anlmdn.o
 X86ASM-OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise.o
+X86ASM-OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect.o
 X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
 X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
 X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
diff --git a/libavfilter/x86/vf_blackdetect.asm b/libavfilter/x86/vf_blackdetect.asm
new file mode 100644
index 0000000000..78c24c5adc
--- /dev/null
+++ b/libavfilter/x86/vf_blackdetect.asm
@@ -0,0 +1,73 @@
+;*****************************************************************************
+;* x86-optimized functions for blackdetect filter
+;*
+;* Copyright (C) 2025 Niklas Haas
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+; blackdetect_<depth>(src, stride, width, height, threshold): returns the count of samples <= threshold
+%macro count_pixels_fn 1 ; depth
+cglobal blackdetect_%1, 5, 7, 2, src, stride, width, height, threshold
+    movd xm1, thresholdd ; threshold into the low dword of xm1
+    %if %1 == 8
+        vpbroadcastb m1, xm1 ; m1 = threshold in every byte lane
+    %else
+        vpbroadcastw m1, xm1 ; m1 = threshold in every word lane
+        shl widthq, 1 ; width in samples -> width in bytes
+    %endif
+    add srcq, widthq ; srcq = end of the current row
+    neg widthq ; widthq = -(row width in bytes)
+    xor r4, r4 ; r4 = running count of set mask bits (threshold already lives in m1)
+    mov r5, widthq ; r5 = negative byte offset relative to the row end
+    jmp .start
+.loop: ; the previous vector lay fully inside the row: count its whole mask
+    popcnt r6d, r6d
+    add r4, r6
+.start:
+    movu m0, [srcq + r5] ; unaligned mmsize-byte load; the last one of a row may extend past the row end
+    %if %1 == 8
+        pmaxub m0, m1 ; m0 = max(sample, threshold)
+        pcmpeqb m0, m1 ; lane is all-ones iff max == threshold, i.e. sample <= threshold
+    %else
+        pmaxuw m0, m1 ; m0 = max(sample, threshold)
+        pcmpeqw m0, m1 ; lane is all-ones iff max == threshold, i.e. sample <= threshold
+    %endif
+    pmovmskb r6d, m0 ; one mask bit per byte (so two bits per 16-bit sample)
+    add r5, mmsize
+    jl .loop ; offset still negative: more of the row remains
+    ; handle tail by shifting away unused high elements (r5 = bytes loaded past the row end)
+    shlx r6d, r6d, r5d
+    popcnt r6d, r6d
+    add r4, r6
+    add srcq, strideq ; srcq = end of the next row
+    mov r5, widthq
+    dec heightq
+    jg .start ; NOTE(review): height is tested only after a row was processed, so height >= 1 is assumed
+    %if %1 > 8
+        shr r4, 1 ; every 16-bit match contributed two mask bits
+    %endif
+    mov rax, r4 ; return value
+    RET
+%endmacro
+
+INIT_YMM avx2
+count_pixels_fn 8
+count_pixels_fn 16
diff --git a/libavfilter/x86/vf_blackdetect_init.c b/libavfilter/x86/vf_blackdetect_init.c
new file mode 100644
index 0000000000..3780072589
--- /dev/null
+++ b/libavfilter/x86/vf_blackdetect_init.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_blackdetect.h"
+
+unsigned ff_blackdetect_8_avx2(const uint8_t *, ptrdiff_t, ptrdiff_t, ptrdiff_t, unsigned);
+unsigned ff_blackdetect_16_avx2(const uint8_t *, ptrdiff_t, ptrdiff_t, ptrdiff_t, unsigned);
+/* Pick the AVX2 counter matching depth; NULL when EXTERNAL_AVX2() rejects the CPU flags. */
+av_cold ff_blackdetect_fn ff_blackdetect_get_fn_x86(int depth)
+{
+    const int flags = av_get_cpu_flags();
+    if (!EXTERNAL_AVX2(flags))
+        return NULL;
+    return depth != 8 ? ff_blackdetect_16_avx2 : ff_blackdetect_8_avx2;
+}
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index afd62d95ba..94d2273ef0 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -53,6 +53,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes)
 
 # libavfilter tests
 AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
+AVFILTEROBJS-$(CONFIG_BLACKDETECT_FILTER) += vf_blackdetect.o
 AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o
 AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o
 AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index f4e3d4f433..9ccf9f97e1 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -263,6 +263,9 @@ static const struct {
     #if CONFIG_AFIR_FILTER
         { "af_afir", checkasm_check_afir },
     #endif
+    #if CONFIG_BLACKDETECT_FILTER
+        { "vf_blackdetect", checkasm_check_blackdetect },
+    #endif
     #if CONFIG_BLEND_FILTER
         { "vf_blend", checkasm_check_blend },
     #endif
diff --git
a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index e829942d58..cddafccff3 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -86,6 +86,7 @@ void checkasm_check_alacdsp(void); void checkasm_check_apv_dsp(void); void checkasm_check_audiodsp(void); void checkasm_check_av_tx(void); +void checkasm_check_blackdetect(void); void checkasm_check_blend(void); void checkasm_check_blockdsp(void); void checkasm_check_bswapdsp(void); diff --git a/tests/checkasm/vf_blackdetect.c b/tests/checkasm/vf_blackdetect.c new file mode 100644 index 0000000000..ec66420168 --- /dev/null +++ b/tests/checkasm/vf_blackdetect.c @@ -0,0 +1,78 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */
+
+#include <string.h>
+#include "checkasm.h"
+
+#include "libavfilter/vf_blackdetect.h"
+#include "libavutil/mem_internal.h"
+
+#define WIDTH 256
+#define HEIGHT 16
+#define WIDTH_PADDED (WIDTH + 32) /* row stride in bytes; parenthesized so products expand correctly */
+
+/*
+ * Run the function returned by ff_blackdetect_get_fn(depth) through
+ * call_ref()/call_new() on HEIGHT rows and fail if the counts differ.
+ * Each row holds WIDTH random bytes followed by zeroed padding up to the
+ * WIDTH_PADDED byte stride. The tested width is reduced by 8 samples so
+ * that a row is not a multiple of 32 bytes, which exercises the tail of a
+ * vectorized implementation.
+ */
+static void check_blackdetect(int depth)
+{
+    LOCAL_ALIGNED_32(uint8_t, in, [HEIGHT * WIDTH_PADDED]);
+    ptrdiff_t line_size = WIDTH_PADDED;
+
+    declare_func(unsigned, const uint8_t *in, ptrdiff_t stride,
+                 ptrdiff_t width, ptrdiff_t height,
+                 unsigned threshold);
+
+    memset(in, 0, HEIGHT * WIDTH_PADDED); /* padding bytes stay zero */
+    for (int y = 0; y < HEIGHT; y++) {
+        for (int x = 0; x < WIDTH; x++)
+            in[y * WIDTH_PADDED + x] = rnd() & 0xFF;
+    }
+
+    const unsigned threshold = 16 << (depth - 8);
+
+    int w = WIDTH; /* WIDTH is a byte count; convert to samples below */
+    if (depth == 16)
+        w /= 2;
+    w -= 8; /* ensure odd tail is handled correctly */
+
+    if (check_func(ff_blackdetect_get_fn(depth), "blackdetect%d", depth)) {
+        unsigned count_ref = call_ref(in, line_size, w, HEIGHT, threshold);
+        unsigned count_new = call_new(in, line_size, w, HEIGHT, threshold);
+        if (count_ref != count_new) {
+            fprintf(stderr, "blackdetect%d: count mismatch: %u != %u\n",
+                    depth, count_ref, count_new);
+            fail();
+        }
+        bench_new(in, line_size, w, HEIGHT, threshold);
+    }
+}
+
+void checkasm_check_blackdetect(void)
+{
+    check_blackdetect(8);
+    report("blackdetect8");
+
+    check_blackdetect(16);
+    report("blackdetect16");
+}
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".