From: MakarDev via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: MakarDev <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH] avfilter/boxblur: add AVX2 assembly (PR #20770)
Date: Mon, 27 Oct 2025 18:47:48 -0000
Message-ID: <176159086870.81.11355698174203966738@7d278768979e> (raw)
PR #20770 opened by MakarDev
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20770
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20770.patch
This adds an AVX2 assembly implementation for the boxblur filter. Because every output sample depends on the running sum produced for the previous sample, the accumulation itself cannot be fully vectorized; the speedup comes from vectorizing all the other operations in the filter. The assembly is written only for the "steady-state" middle part of each blurred line; the leading and trailing edges are still processed by the existing C code.
Benchmarking results
tests/checkasm/checkasm --test=vf_boxblur --bench
AVX2:
- vf_boxblur.boxblur_blur8 [OK]
- vf_boxblur.boxblur_blur16 [OK]
checkasm: all 2 tests passed
boxblur_blur8_c: 1396.9 ( 1.00x)
boxblur_blur8_avx2: 541.1 ( 2.58x)
boxblur_blur16_c: 1256.0 ( 1.00x)
boxblur_blur16_avx2: 504.2 ( 2.49x)
>From 26e836c1ebf2bfcd3c02f9e7d7a46dd135ee6174 Mon Sep 17 00:00:00 2001
From: MakarDev <kuznietsov.makar@gmail.com>
Date: Thu, 16 Oct 2025 22:44:31 -0700
Subject: [PATCH] avfilter/boxblur: add AVX2 assembly
---
libavfilter/Makefile | 2 +-
libavfilter/boxblur.h | 9 ++
libavfilter/boxblur_dsp.c | 37 ++++++
libavfilter/vf_boxblur.c | 93 ++++++++++---
libavfilter/vf_boxblur_dsp.h | 46 +++++++
libavfilter/x86/Makefile | 2 +
libavfilter/x86/vf_boxblur.asm | 213 ++++++++++++++++++++++++++++++
libavfilter/x86/vf_boxblur_init.c | 50 +++++++
tests/checkasm/Makefile | 1 +
tests/checkasm/checkasm.c | 3 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/vf_boxblur.c | 148 +++++++++++++++++++++
tests/fate/checkasm.mak | 1 +
13 files changed, 585 insertions(+), 21 deletions(-)
create mode 100644 libavfilter/boxblur_dsp.c
create mode 100644 libavfilter/vf_boxblur_dsp.h
create mode 100644 libavfilter/x86/vf_boxblur.asm
create mode 100644 libavfilter/x86/vf_boxblur_init.c
create mode 100644 tests/checkasm/vf_boxblur.c
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 69d74183b2..00f956dc19 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -217,7 +217,7 @@ OBJS-$(CONFIG_BLEND_VULKAN_FILTER) += vf_blend_vulkan.o framesync.o vu
OBJS-$(CONFIG_BLOCKDETECT_FILTER) += vf_blockdetect.o
OBJS-$(CONFIG_BLURDETECT_FILTER) += vf_blurdetect.o edge_common.o
OBJS-$(CONFIG_BM3D_FILTER) += vf_bm3d.o framesync.o
-OBJS-$(CONFIG_BOXBLUR_FILTER) += vf_boxblur.o boxblur.o
+OBJS-$(CONFIG_BOXBLUR_FILTER) += vf_boxblur.o boxblur.o boxblur_dsp.o
OBJS-$(CONFIG_BOXBLUR_OPENCL_FILTER) += vf_avgblur_opencl.o opencl.o \
opencl/avgblur.o boxblur.o
OBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o bwdifdsp.o yadif_common.o
diff --git a/libavfilter/boxblur.h b/libavfilter/boxblur.h
index 214d4e0c93..16ca377600 100644
--- a/libavfilter/boxblur.h
+++ b/libavfilter/boxblur.h
@@ -44,4 +44,13 @@ int ff_boxblur_eval_filter_params(AVFilterLink *inlink,
FilterParam *chroma_param,
FilterParam *alpha_param);
+/* Forward declaration; the full definition lives in vf_boxblur_dsp.h */
+typedef struct FFBoxblurDSPContext FFBoxblurDSPContext;
+
+/*
+ * One-dimensional box blur of a single line of samples - used for testing
+ * and internally.
+ *
+ * dst, src            output and input lines
+ * dst_step, src_step  distance between consecutive samples, counted in
+ *                     elements of the pointer type (not in bytes)
+ * len                 number of samples in the line
+ * radius              blur radius; the averaging window is 2*radius + 1 wide
+ * dsp                 optional DSP context; its middle() kernel is used for
+ *                     the steady-state part when both steps are 1. When dsp
+ *                     is NULL the plain C loop is used instead.
+ */
+void ff_boxblur_blur8(uint8_t *dst, int dst_step, const uint8_t *src,
+ int src_step, int len, int radius, FFBoxblurDSPContext *dsp);
+void ff_boxblur_blur16(uint16_t *dst, int dst_step, const uint16_t *src,
+ int src_step, int len, int radius, FFBoxblurDSPContext *dsp);
+
#endif // AVFILTER_BOXBLUR_H
diff --git a/libavfilter/boxblur_dsp.c b/libavfilter/boxblur_dsp.c
new file mode 100644
index 0000000000..9633cd1062
--- /dev/null
+++ b/libavfilter/boxblur_dsp.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2025 Makar Kuznietsov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "vf_boxblur_dsp.h"
+
+#if ARCH_X86_64
+void ff_boxblur_dsp_init_x86(FFBoxblurDSPContext *dsp, int depth);
+#endif
+
+/**
+ * Fill in the boxblur DSP function table.
+ *
+ * The portable C kernel is installed first; on x86-64 builds the
+ * architecture-specific initializer is then given the chance to replace it.
+ *
+ * @param dsp   context to initialize
+ * @param depth bits per sample; anything above 8 selects the 16-bit kernel
+ */
+av_cold void ff_boxblur_dsp_init(FFBoxblurDSPContext *dsp, int depth)
+{
+    /* NOTE(review): the kernels take typed sample pointers while the table
+     * entry takes void pointers, hence the casts - confirm this is intended. */
+    if (depth > 8)
+        dsp->middle = (void *)boxblur_middle16_c;
+    else
+        dsp->middle = (void *)boxblur_middle8_c;
+
+#if ARCH_X86_64
+    ff_boxblur_dsp_init_x86(dsp, depth);
+#endif
+}
+
diff --git a/libavfilter/vf_boxblur.c b/libavfilter/vf_boxblur.c
index 3cb42471a7..07bf979453 100644
--- a/libavfilter/vf_boxblur.c
+++ b/libavfilter/vf_boxblur.c
@@ -33,7 +33,7 @@
#include "formats.h"
#include "video.h"
#include "boxblur.h"
-
+#include "vf_boxblur_dsp.h"
typedef struct BoxBlurContext {
const AVClass *class;
@@ -45,6 +45,7 @@ typedef struct BoxBlurContext {
int radius[4];
int power[4];
uint8_t *temp[2]; ///< temporary buffer used in blur_power()
+ FFBoxblurDSPContext dsp;
} BoxBlurContext;
static av_cold void uninit(AVFilterContext *ctx)
@@ -108,9 +109,39 @@ static int config_input(AVFilterLink *inlink)
s->power[U] = s->power[V] = s->chroma_param.power;
s->power[A] = s->alpha_param.power;
+ ff_boxblur_dsp_init(&s->dsp, desc->comp[0].depth);
+
return 0;
}
+/**
+ * C reference for the steady-state ("middle") part of the 8-bit box blur.
+ *
+ * For every x in [x_start, x_end) the sample entering the window on the
+ * right (src[x + radius]) is added and the one leaving on the left
+ * (src[x - radius - 1]) is removed, both scaled by inv, and the upper
+ * bits of the running sum are stored to dst[x].
+ *
+ * @param dst     output line, written at indices x_start .. x_end - 1
+ * @param src     input line, contiguous samples
+ * @param x_start first output index
+ * @param x_end   one past the last output index
+ * @param radius  blur radius
+ * @param inv     16.16 fixed-point scale applied to each sample difference
+ * @param sum_ptr running sum; read on entry and updated on return
+ */
+void boxblur_middle8_c(uint8_t *dst, const uint8_t *src,
+                       int x_start, int x_end, int radius,
+                       int inv, int *sum_ptr)
+{
+    int acc = *sum_ptr;
+    int pos = x_start;
+
+    while (pos < x_end) {
+        const int entering = src[pos + radius];
+        const int leaving  = src[pos - radius - 1];
+
+        acc += (entering - leaving) * inv;
+        dst[pos] = acc >> 16;
+        pos++;
+    }
+
+    *sum_ptr = acc;
+}
+
+/**
+ * C reference for the steady-state ("middle") part of the 16-bit box blur.
+ *
+ * For every x in [x_start, x_end) the sample entering the window on the
+ * right (src[x + radius]) is added and the one leaving on the left
+ * (src[x - radius - 1]) is removed, both scaled by inv, and bits 16..31
+ * of the running sum are stored to dst[x].
+ *
+ * With 16-bit samples the running sum (roughly average * 65536) does not
+ * fit into a signed 32-bit int once the average exceeds 32767, so the
+ * accumulation is carried out in unsigned arithmetic, which wraps modulo
+ * 2^32 instead of invoking undefined behaviour. The bits stored to dst
+ * are the same ones a wrapping signed accumulator would produce.
+ *
+ * @param dst     output line, written at indices x_start .. x_end - 1
+ * @param src     input line, contiguous samples
+ * @param x_start first output index
+ * @param x_end   one past the last output index
+ * @param radius  blur radius
+ * @param inv     16.16 fixed-point scale applied to each sample difference
+ * @param sum_ptr running sum (bit pattern); read on entry, updated on return
+ */
+void boxblur_middle16_c(uint16_t *dst, const uint16_t *src,
+                        int x_start, int x_end, int radius,
+                        int inv, int *sum_ptr)
+{
+    int x;
+    unsigned sum = (unsigned)*sum_ptr;
+
+    for (x = x_start; x < x_end; x++) {
+        /* the difference may be negative; converting it to unsigned keeps
+         * the multiplication and the addition modular */
+        const int diff = src[radius + x] - src[x - radius - 1];
+
+        sum += (unsigned)diff * (unsigned)inv;
+        dst[x] = sum >> 16;
+    }
+
+    *sum_ptr = (int)sum;
+}
+
/* Naive boxblur would sum source pixels from x-radius .. x+radius
* for destination pixel x. That would be O(radius*width).
* If you now look at what source pixels represent 2 consecutive
@@ -125,9 +156,10 @@ static int config_input(AVFilterLink *inlink)
* and subtracting 1 input pixel.
* The following code adopts this faster variant.
*/
-#define BLUR(type, depth) \
-static inline void blur ## depth(type *dst, int dst_step, const type *src, \
- int src_step, int len, int radius) \
+#define BLUR(type, depth) \
+void ff_boxblur_blur ## depth(type *dst, int dst_step, const type *src, \
+ int src_step, int len, int radius, \
+ FFBoxblurDSPContext *dsp) \
{ \
const int length = radius*2 + 1; \
const int inv = ((1<<16) + length/2)/length; \
@@ -143,9 +175,27 @@ static inline void blur ## depth(type *dst, int dst_step, const type *src, \
dst[x*dst_step] = sum>>16; \
} \
\
- for (; x < len-radius; x++) { \
- sum += (src[(radius+x)*src_step] - src[(x-radius-1)*src_step])*inv; \
- dst[x*dst_step] = sum >>16; \
+ /* Middle loop: use optimized function if strides are 1 */ \
+ { \
+ int middle_start = radius + 1; \
+ int middle_end = len - radius; \
+ if (middle_end > middle_start && dst_step == 1 && src_step == 1) { \
+ int middle_end_mod16 = middle_end - ((middle_end-middle_start)%16); \
+ if (dsp && dsp->middle && middle_end_mod16 > middle_start) { \
+ dsp->middle(dst, src, middle_start, middle_end_mod16, \
+ radius, inv, &sum); \
+ x = middle_end_mod16; \
+ } \
+ for (; x < middle_end; x++) { \
+ sum += (src[(radius+x)*src_step] - src[(x-radius-1)*src_step])*inv; \
+ dst[x*dst_step] = sum >>16; \
+ } \
+ } else { \
+ for (x = middle_start; x < middle_end; x++) { \
+ sum += (src[(radius+x)*src_step] - src[(x-radius-1)*src_step])*inv; \
+ dst[x*dst_step] = sum >>16; \
+ } \
+ } \
} \
\
for (; x < len; x++) { \
@@ -160,26 +210,27 @@ BLUR(uint16_t, 16)
#undef BLUR
static inline void blur(uint8_t *dst, int dst_step, const uint8_t *src, int src_step,
- int len, int radius, int pixsize)
+ int len, int radius, int pixsize, FFBoxblurDSPContext *dsp)
{
- if (pixsize == 1) blur8 (dst, dst_step , src, src_step , len, radius);
- else blur16((uint16_t*)dst, dst_step>>1, (const uint16_t*)src, src_step>>1, len, radius);
+ if (pixsize == 1) ff_boxblur_blur8 (dst, dst_step , src, src_step , len, radius, dsp);
+ else ff_boxblur_blur16((uint16_t*)dst, dst_step>>1, (const uint16_t*)src, src_step>>1, len, radius, dsp);
}
static inline void blur_power(uint8_t *dst, int dst_step, const uint8_t *src, int src_step,
- int len, int radius, int power, uint8_t *temp[2], int pixsize)
+ int len, int radius, int power, uint8_t *temp[2], int pixsize,
+ FFBoxblurDSPContext *dsp)
{
uint8_t *a = temp[0], *b = temp[1];
if (radius && power) {
- blur(a, pixsize, src, src_step, len, radius, pixsize);
+ blur(a, pixsize, src, src_step, len, radius, pixsize, dsp);
for (; power > 2; power--) {
uint8_t *c;
- blur(b, pixsize, a, pixsize, len, radius, pixsize);
+ blur(b, pixsize, a, pixsize, len, radius, pixsize, dsp);
c = a; a = b; b = c;
}
if (power > 1) {
- blur(dst, dst_step, a, pixsize, len, radius, pixsize);
+ blur(dst, dst_step, a, pixsize, len, radius, pixsize, dsp);
} else {
int i;
if (pixsize == 1) {
@@ -201,7 +252,8 @@ static inline void blur_power(uint8_t *dst, int dst_step, const uint8_t *src, in
}
static void hblur(uint8_t *dst, int dst_linesize, const uint8_t *src, int src_linesize,
- int w, int h, int radius, int power, uint8_t *temp[2], int pixsize)
+ int w, int h, int radius, int power, uint8_t *temp[2], int pixsize,
+ FFBoxblurDSPContext *dsp)
{
int y;
@@ -210,11 +262,12 @@ static void hblur(uint8_t *dst, int dst_linesize, const uint8_t *src, int src_li
for (y = 0; y < h; y++)
blur_power(dst + y*dst_linesize, pixsize, src + y*src_linesize, pixsize,
- w, radius, power, temp, pixsize);
+ w, radius, power, temp, pixsize, dsp);
}
static void vblur(uint8_t *dst, int dst_linesize, const uint8_t *src, int src_linesize,
- int w, int h, int radius, int power, uint8_t *temp[2], int pixsize)
+ int w, int h, int radius, int power, uint8_t *temp[2], int pixsize,
+ FFBoxblurDSPContext *dsp)
{
int x;
@@ -223,7 +276,7 @@ static void vblur(uint8_t *dst, int dst_linesize, const uint8_t *src, int src_li
for (x = 0; x < w; x++)
blur_power(dst + x*pixsize, dst_linesize, src + x*pixsize, src_linesize,
- h, radius, power, temp, pixsize);
+ h, radius, power, temp, pixsize, dsp);
}
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
@@ -251,13 +304,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
hblur(out->data[plane], out->linesize[plane],
in ->data[plane], in ->linesize[plane],
w[plane], h[plane], s->radius[plane], s->power[plane],
- s->temp, pixsize);
+ s->temp, pixsize, &s->dsp);
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++)
vblur(out->data[plane], out->linesize[plane],
out->data[plane], out->linesize[plane],
w[plane], h[plane], s->radius[plane], s->power[plane],
- s->temp, pixsize);
+ s->temp, pixsize, &s->dsp);
av_frame_free(&in);
diff --git a/libavfilter/vf_boxblur_dsp.h b/libavfilter/vf_boxblur_dsp.h
new file mode 100644
index 0000000000..c2603df55f
--- /dev/null
+++ b/libavfilter/vf_boxblur_dsp.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2025 Makar Kuznietsov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_BOXBLUR_DSP_H
+#define AVFILTER_BOXBLUR_DSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Function table for the boxblur filter's optimized inner loop. */
+typedef struct FFBoxblurDSPContext {
+ /*
+  * Optimized middle-loop function for steady-state blur.
+  *
+  * For each x in [x_start, x_end) it performs
+  *     sum   += (src[x + radius] - src[x - radius - 1]) * inv;
+  *     dst[x] = sum >> 16;
+  * on contiguous samples, with the running sum read from and written
+  * back to *sum_ptr. The blur code only calls it when both strides are 1
+  * and passes a range whose length is a multiple of 16.
+  *
+  * NOTE(review): the installed kernels are declared with uint8_t or
+  * uint16_t sample pointers and are stored here through a cast - confirm
+  * the mismatch with this void-pointer prototype is acceptable.
+  */
+ void (*middle)(void *dst, const void *src,
+ int x_start, int x_end, int radius,
+ int inv, int *sum_ptr);
+} FFBoxblurDSPContext;
+
+/* C reference implementations */
+void boxblur_middle8_c(uint8_t *dst, const uint8_t *src,
+ int x_start, int x_end, int radius,
+ int inv, int *sum_ptr);
+
+void boxblur_middle16_c(uint16_t *dst, const uint16_t *src,
+ int x_start, int x_end, int radius,
+ int inv, int *sum_ptr);
+
+void ff_boxblur_dsp_init(FFBoxblurDSPContext *dsp, int depth);
+
+#endif /* AVFILTER_BOXBLUR_DSP_H */
+
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index b485c10fbe..f8840c5a73 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -5,6 +5,7 @@ OBJS-$(CONFIG_ANLMDN_FILTER) += x86/af_anlmdn_init.o
OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise_init.o
OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect_init.o
OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
+OBJS-$(CONFIG_BOXBLUR_FILTER) += x86/vf_boxblur_init.o
OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
OBJS-$(CONFIG_COLORDETECT_FILTER) += x86/vf_colordetect_init.o
OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
@@ -53,6 +54,7 @@ X86ASM-OBJS-$(CONFIG_ANLMDN_FILTER) += x86/af_anlmdn.o
X86ASM-OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise.o
X86ASM-OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect.o
X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
+X86ASM-OBJS-$(CONFIG_BOXBLUR_FILTER) += x86/vf_boxblur.o
X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
X86ASM-OBJS-$(CONFIG_COLORDETECT_FILTER) += x86/vf_colordetect.o
X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
diff --git a/libavfilter/x86/vf_boxblur.asm b/libavfilter/x86/vf_boxblur.asm
new file mode 100644
index 0000000000..069ed092c9
--- /dev/null
+++ b/libavfilter/x86/vf_boxblur.asm
@@ -0,0 +1,213 @@
+;*****************************************************************************
+;* x86-optimized functions for boxblur filter
+;*
+;* Copyright (c) 2025 Makar Kuznietsov
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%if ARCH_X86_64
+
+; void ff_boxblur_middle_avx2(uint8_t *dst, const uint8_t *src,
+;                             int x_start, int x_end, int radius,
+;                             int inv, int *sum_ptr)
+;
+; Steady-state box blur for 8-bit samples, 16 outputs per iteration:
+;     sum   += (src[x + radius] - src[x - radius - 1]) * inv
+;     dst[x] = sum >> 16
+; starting at x = x_start. The loop body always runs at least once and x
+; advances by 16, so x_end - x_start has to be a positive multiple of 16.
+; The running sum is loaded from *sum_ptr and stored back on return.
+INIT_YMM avx2
+cglobal boxblur_middle, 7, 10, 6, dst, src, x_start, x_end, radius, inv, sum_ptr, x, tmp, sum
+    ; radius is a 32-bit int argument: the upper half of its register is not
+    ; defined by the calling convention, so sign-extend it before radiusq is
+    ; used in address arithmetic below
+    movsxdifnidn radiusq, radiusd
+    mov          sumd, [sum_ptrq]
+    movd         xm3, invd
+    vpbroadcastd m3, xm3
+    mov          xd, x_startd           ; 32-bit mov clears the upper half of xq
+
+.vloop:
+    ; Load incoming pixels: src[x + radius]
+    lea          tmpq, [xq + radiusq]
+    movu         xm0, [srcq + tmpq]
+
+    ; Load outgoing pixels: src[x - radius - 1]
+    lea          tmpq, [xq - 1]
+    sub          tmpq, radiusq
+    movu         xm1, [srcq + tmpq]
+
+    ; Zero-extend u8 -> u16
+    pmovzxbw     m0, xm0
+    pmovzxbw     m1, xm1
+
+    ; Compute signed difference (fits in 16 bits: -255..255)
+    psubw        m2, m0, m1
+    pmovsxwd     m4, xm2
+
+    ; Extract high 8 words and sign-extend
+    vextracti128 xm0, m2, 1
+    pmovsxwd     m5, xm0
+
+    ; Multiply by inv
+    pmulld       m4, m4, m3
+    pmulld       m5, m5, m3
+
+    ; Compute prefix sum for m4 (lower 8 pixels); the byte shifts work
+    ; within each 128-bit lane, giving two independent 4-element scans
+    mova         m0, m4
+    pslldq       m1, m0, 4
+    paddd        m0, m0, m1
+    pslldq       m1, m0, 8
+    paddd        m0, m0, m1
+
+    ; Propagate carry across 128-bit lanes: add the low lane's total to
+    ; every element of the high lane
+    vextracti128 xm1, m0, 0
+    vpshufd      xm1, xm1, 0xFF
+    vpxor        m2, m2, m2
+    vinserti128  m2, m2, xm1, 1
+    vpaddd       m0, m0, m2
+
+    ; Add accumulator
+    movd         xm2, sumd
+    vpbroadcastd m2, xm2
+    paddd        m0, m0, m2
+    mova         m4, m0
+
+    ; Update accumulator for next iteration (last element of the scan)
+    vextracti128 xm1, m0, 1
+    pshufd       xm1, xm1, 0xFF
+    movd         sumd, xm1
+
+    ; Compute prefix sum for m5 (upper 8 pixels)
+    mova         m0, m5
+    pslldq       m1, m0, 4
+    paddd        m0, m0, m1
+    pslldq       m1, m0, 8
+    paddd        m0, m0, m1
+
+    ; Propagate carry across 128-bit lanes
+    vextracti128 xm1, m0, 0
+    pshufd       xm1, xm1, 0xFF
+    pxor         m2, m2, m2
+    vinserti128  m2, m2, xm1, 1
+    paddd        m0, m0, m2
+
+    ; Add accumulator
+    movd         xm2, sumd
+    vpbroadcastd m2, xm2
+    paddd        m0, m0, m2
+    mova         m5, m0
+
+    ; Update accumulator for next iteration
+    vextracti128 xm1, m0, 1
+    pshufd       xm1, xm1, 0xFF
+    movd         sumd, xm1
+
+    ; Shift and pack results
+    psrad        m4, m4, 16
+    psrad        m5, m5, 16
+
+    ; Pack lower 8 pixels
+    vextracti128 xm0, m4, 0
+    vextracti128 xm1, m4, 1
+    packusdw     xm0, xm0, xm1
+    packuswb     xm0, xm0, xm0
+    movq         [dstq + xq + 0], xm0
+
+    ; Pack upper 8 pixels
+    vextracti128 xm0, m5, 0
+    vextracti128 xm1, m5, 1
+    packusdw     xm0, xm0, xm1
+    packuswb     xm0, xm0, xm0
+    movq         [dstq + xq + 8], xm0
+
+    add          xd, 16
+    cmp          xd, x_endd
+    jl .vloop
+
+    mov          [sum_ptrq], sumd
+    RET
+
+; void ff_boxblur_middle16_avx2(uint16_t *dst, const uint16_t *src,
+;                               int x_start, int x_end, int radius,
+;                               int inv, int *sum_ptr)
+;
+; Steady-state box blur for 16-bit samples, 8 outputs per iteration:
+;     sum   += (src[x + radius] - src[x - radius - 1]) * inv
+;     dst[x] = sum >> 16
+; starting at x = x_start. The loop body always runs at least once and x
+; advances by 8, so x_end - x_start has to be a positive multiple of 8.
+; The running sum is loaded from *sum_ptr and stored back on return.
+INIT_YMM avx2
+cglobal boxblur_middle16, 7, 10, 5, dst, src, x_start, x_end, radius, inv, sum_ptr, x, tmp, sum
+    ; radius is a 32-bit int argument: the upper half of its register is not
+    ; defined by the calling convention, so sign-extend it before radiusq is
+    ; used in address arithmetic below
+    movsxdifnidn radiusq, radiusd
+    mov          sumd, [sum_ptrq]
+    movd         xm3, invd
+    vpbroadcastd m3, xm3
+    mov          xd, x_startd           ; 32-bit mov clears the upper half of xq
+
+.vloop:
+    ; Load incoming pixels: src[x + radius] (index scaled by 2 bytes per sample)
+    lea          tmpq, [xq + radiusq]
+    movu         xm0, [srcq + tmpq*2]
+
+    ; Load outgoing pixels: src[x - radius - 1]
+    lea          tmpq, [xq - 1]
+    sub          tmpq, radiusq
+    movu         xm1, [srcq + tmpq*2]
+
+    ; Zero-extend u16 -> u32
+    pmovzxwd     m0, xm0
+    pmovzxwd     m1, xm1
+
+    ; Compute signed difference
+    psubd        m2, m0, m1
+
+    ; Multiply by inv
+    pmulld       m4, m2, m3
+
+    ; Compute prefix sum; the byte shifts work within each 128-bit lane,
+    ; giving two independent 4-element scans
+    mova         m0, m4
+    pslldq       m1, m0, 4
+    paddd        m0, m0, m1
+    pslldq       m1, m0, 8
+    paddd        m0, m0, m1
+
+    ; Propagate carry across 128-bit lanes: add the low lane's total to
+    ; every element of the high lane
+    vextracti128 xm1, m0, 0
+    pshufd       xm1, xm1, 0xFF
+    pxor         m2, m2, m2
+    vinserti128  m2, m2, xm1, 1
+    paddd        m0, m0, m2
+
+    ; Add accumulator
+    movd         xm2, sumd
+    vpbroadcastd m2, xm2
+    paddd        m0, m0, m2
+    mova         m4, m0
+
+    ; Update accumulator for next iteration (last element of the scan)
+    vextracti128 xm1, m0, 1
+    pshufd       xm1, xm1, 0xFF
+    movd         sumd, xm1
+
+    ; Shift and pack results; the logical shift leaves values in 0..65535,
+    ; so the saturating pack stores them unchanged
+    psrld        m4, m4, 16
+    vextracti128 xm0, m4, 0
+    vextracti128 xm1, m4, 1
+    packusdw     xm0, xm0, xm1
+    movu         [dstq + xq*2], xm0
+
+    add          xd, 8
+    cmp          xd, x_endd
+    jl .vloop
+
+    mov          [sum_ptrq], sumd
+    RET
+
+%endif
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+%endif
diff --git a/libavfilter/x86/vf_boxblur_init.c b/libavfilter/x86/vf_boxblur_init.c
new file mode 100644
index 0000000000..e11536d10c
--- /dev/null
+++ b/libavfilter/x86/vf_boxblur_init.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2025 Makar Kuznietsov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+
+#include "libavfilter/vf_boxblur_dsp.h"
+
+/* Forward declaration */
+void ff_boxblur_dsp_init_x86(FFBoxblurDSPContext *dsp, int depth);
+
+/* AVX2 optimized middle-loop functions */
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+void ff_boxblur_middle_avx2(uint8_t *dst, const uint8_t *src,
+ int x_start, int x_end, int radius,
+ int inv, int *sum_ptr);
+
+void ff_boxblur_middle16_avx2(uint16_t *dst, const uint16_t *src,
+ int x_start, int x_end, int radius,
+ int inv, int *sum_ptr);
+#endif
+
+/**
+ * Install the x86 AVX2 middle-loop kernel when the build provides it and
+ * the running CPU reports fast AVX2; otherwise leave the table untouched.
+ *
+ * @param dsp   context whose middle() entry may be replaced
+ * @param depth bits per sample; anything above 8 selects the 16-bit kernel
+ */
+av_cold void ff_boxblur_dsp_init_x86(FFBoxblurDSPContext *dsp, int depth)
+{
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+    const int flags = av_get_cpu_flags();
+
+    if (!EXTERNAL_AVX2_FAST(flags))
+        return;
+
+    if (depth > 8)
+        dsp->middle = (void *)ff_boxblur_middle16_avx2;
+    else
+        dsp->middle = (void *)ff_boxblur_middle_avx2;
+#endif
+}
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index e47070d90f..8d3196bbdf 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -60,6 +60,7 @@ AVFILTEROBJS-$(CONFIG_SCENE_SAD) += scene_sad.o
AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
AVFILTEROBJS-$(CONFIG_BLACKDETECT_FILTER) += vf_blackdetect.o
AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o
+AVFILTEROBJS-$(CONFIG_BOXBLUR_FILTER) += vf_boxblur.o
AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o
AVFILTEROBJS-$(CONFIG_COLORDETECT_FILTER)+= vf_colordetect.o
AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 4469e043f5..23800b9978 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -285,6 +285,9 @@ static const struct {
#if CONFIG_BLEND_FILTER
{ "vf_blend", checkasm_check_blend },
#endif
+ #if CONFIG_BOXBLUR_FILTER
+ { "vf_boxblur", checkasm_check_boxblur },
+ #endif
#if CONFIG_BWDIF_FILTER
{ "vf_bwdif", checkasm_check_vf_bwdif },
#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index e1ccd4011b..bfca26cb82 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -89,6 +89,7 @@ void checkasm_check_av_tx(void);
void checkasm_check_blackdetect(void);
void checkasm_check_blend(void);
void checkasm_check_blockdsp(void);
+void checkasm_check_boxblur(void);
void checkasm_check_bswapdsp(void);
void checkasm_check_cavsdsp(void);
void checkasm_check_colordetect(void);
diff --git a/tests/checkasm/vf_boxblur.c b/tests/checkasm/vf_boxblur.c
new file mode 100644
index 0000000000..c67abc5ece
--- /dev/null
+++ b/tests/checkasm/vf_boxblur.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025 Makar Kuznietsov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/cpu.h"
+
+#include "libavfilter/boxblur.h"
+#include "libavfilter/vf_boxblur_dsp.h"
+
+static int current_depth = 8;
+
+/* Reference path for the 8-bit test: the DSP context is initialized while
+ * the AVX2 CPU flag is masked out, then the original flags are restored
+ * before the blur itself runs. */
+static void blur8_c(uint8_t *dst, int dst_step, const uint8_t *src,
+                    int src_step, int len, int radius)
+{
+    const int orig_flags = av_get_cpu_flags();
+    FFBoxblurDSPContext ctx;
+
+    av_force_cpu_flags(orig_flags & ~AV_CPU_FLAG_AVX2);
+    ff_boxblur_dsp_init(&ctx, current_depth);
+    av_force_cpu_flags(orig_flags);
+
+    ff_boxblur_blur8(dst, dst_step, src, src_step, len, radius, &ctx);
+}
+
+/* Optimized path for the 8-bit test: the DSP context is initialized with
+ * the currently active CPU flags. */
+static void blur8_simd(uint8_t *dst, int dst_step, const uint8_t *src,
+                       int src_step, int len, int radius)
+{
+    FFBoxblurDSPContext ctx;
+
+    ff_boxblur_dsp_init(&ctx, current_depth);
+
+    ff_boxblur_blur8(dst, dst_step, src, src_step, len, radius, &ctx);
+}
+
+/*
+ * Compare and benchmark the 8-bit line blur.
+ *
+ * depth is stored in current_depth so that the wrapper functions pass it
+ * to ff_boxblur_dsp_init(). Sixteen random lines of 64..319 samples are
+ * blurred with a radius of 1..15 and unit strides, and the reference and
+ * new outputs must match byte for byte.
+ */
+static void check_blur8(int depth)
+{
+ LOCAL_ALIGNED_32(uint8_t, src, [2048]);
+ LOCAL_ALIGNED_32(uint8_t, dst0, [2048]);
+ LOCAL_ALIGNED_32(uint8_t, dst1, [2048]);
+
+ declare_func(void, uint8_t *, int, const uint8_t *, int, int, int);
+
+ current_depth = depth;
+
+ /* Register exactly one version per CPU run so checkasm records C and AVX2 */
+ void (*fn)(uint8_t *, int, const uint8_t *, int, int, int) =
+ (av_get_cpu_flags() & AV_CPU_FLAG_AVX2) ? blur8_simd : blur8_c;
+
+ if (check_func(fn, "boxblur_blur8")) {
+ for (int iter = 0; iter < 16; iter++) {
+ /* len >= 64, so (len - 1) / 2 never limits the radius below 15 */
+ const int len = 64 + (rnd() % 256);
+ const int radius = FFMIN((len - 1) / 2, 1 + (rnd() % 15));
+ for (int i = 0; i < len; i++)
+ src[i] = rnd();
+
+ call_ref(dst0, 1, src, 1, len, radius);
+ call_new(dst1, 1, src, 1, len, radius);
+ /* only the first len samples are written, so only those are compared */
+ if (memcmp(dst0, dst1, len))
+ fail();
+ }
+
+ /* Benchmark with typical size */
+ const int bench_len = 256;
+ const int bench_radius = 8;
+ for (int i = 0; i < bench_len; i++)
+ src[i] = rnd();
+ bench_new(dst1, 1, src, 1, bench_len, bench_radius);
+ }
+}
+
+/* Reference path for the 16-bit test: the DSP context is initialized while
+ * the AVX2 CPU flag is masked out, then the original flags are restored
+ * before the blur itself runs. */
+static void blur16_c(uint16_t *dst, int dst_step, const uint16_t *src,
+                     int src_step, int len, int radius)
+{
+    const int orig_flags = av_get_cpu_flags();
+    FFBoxblurDSPContext ctx;
+
+    av_force_cpu_flags(orig_flags & ~AV_CPU_FLAG_AVX2);
+    ff_boxblur_dsp_init(&ctx, current_depth);
+    av_force_cpu_flags(orig_flags);
+
+    ff_boxblur_blur16(dst, dst_step, src, src_step, len, radius, &ctx);
+}
+
+/* Optimized path for the 16-bit test: the DSP context is initialized with
+ * the currently active CPU flags. */
+static void blur16_simd(uint16_t *dst, int dst_step, const uint16_t *src,
+                        int src_step, int len, int radius)
+{
+    FFBoxblurDSPContext ctx;
+
+    ff_boxblur_dsp_init(&ctx, current_depth);
+
+    ff_boxblur_blur16(dst, dst_step, src, src_step, len, radius, &ctx);
+}
+
+/*
+ * Compare and benchmark the 16-bit line blur.
+ *
+ * depth is stored in current_depth so that the wrapper functions pass it
+ * to ff_boxblur_dsp_init(). Sixteen random lines of 64..319 samples are
+ * blurred with a radius of 1..15 and unit strides, and the reference and
+ * new outputs must match exactly.
+ */
+static void check_blur16(int depth)
+{
+ LOCAL_ALIGNED_32(uint16_t, src, [2048]);
+ LOCAL_ALIGNED_32(uint16_t, dst0, [2048]);
+ LOCAL_ALIGNED_32(uint16_t, dst1, [2048]);
+
+ declare_func(void, uint16_t *, int, const uint16_t *, int, int, int);
+
+ current_depth = depth;
+
+ /* Register exactly one version per CPU run so checkasm records C and AVX2 */
+ void (*fn)(uint16_t *, int, const uint16_t *, int, int, int) =
+ (av_get_cpu_flags() & AV_CPU_FLAG_AVX2) ? blur16_simd : blur16_c;
+
+ if (check_func(fn, "boxblur_blur16")) {
+ for (int iter = 0; iter < 16; iter++) {
+ /* len >= 64, so (len - 1) / 2 never limits the radius below 15 */
+ const int len = 64 + (rnd() % 256);
+ const int radius = FFMIN((len - 1) / 2, 1 + (rnd() % 15));
+ for (int i = 0; i < len; i++)
+ src[i] = rnd();
+
+ call_ref(dst0, 1, src, 1, len, radius);
+ call_new(dst1, 1, src, 1, len, radius);
+ /* compare len samples of two bytes each */
+ if (memcmp(dst0, dst1, len * sizeof(uint16_t)))
+ fail();
+ }
+
+ /* Benchmark with typical size */
+ const int bench_len = 256;
+ const int bench_radius = 8;
+ for (int i = 0; i < bench_len; i++)
+ src[i] = rnd();
+ bench_new(dst1, 1, src, 1, bench_len, bench_radius);
+ }
+}
+
+/* checkasm entry point for the boxblur filter: runs the 8-bit check with
+ * depth 8 and the 16-bit check with depth 16, reporting each one under
+ * its own name. */
+void checkasm_check_boxblur(void)
+{
+ check_blur8(8);
+ report("boxblur_blur8");
+
+ check_blur16(16);
+ report("boxblur_blur16");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index ca1cd0dea3..3fcef57496 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -63,6 +63,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \
fate-checkasm-vc1dsp \
fate-checkasm-vf_blackdetect \
fate-checkasm-vf_blend \
+ fate-checkasm-vf_boxblur \
fate-checkasm-vf_bwdif \
fate-checkasm-vf_colordetect \
fate-checkasm-vf_colorspace \
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-10-27 18:48 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=176159086870.81.11355698174203966738@7d278768979e \
--to=ffmpeg-devel@ffmpeg.org \
--cc=code@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git