On 6/20/2025 10:21 AM, Niklas Haas wrote: > From: Niklas Haas > > Processes two channels in parallel, using 128-bit XMM registers. > > In theory, we could go up to YMM registers to process 4 channels, but this is > not a gain except for relatively high channel counts (e.g. 7.1), and also > complicates the sample load/store operations considerably. > > I decided to only add an AVX variant, since the C code is not substantially > slower enough to justify a separate function just for ancient CPUs. > --- > libavfilter/f_ebur128.c | 15 ++-- > libavfilter/f_ebur128.h | 16 ++++ > libavfilter/x86/Makefile | 4 + > libavfilter/x86/f_ebur128.asm | 141 +++++++++++++++++++++++++++++++ > libavfilter/x86/f_ebur128_init.c | 35 ++++++++ > 5 files changed, 206 insertions(+), 5 deletions(-) > create mode 100644 libavfilter/x86/f_ebur128.asm > create mode 100644 libavfilter/x86/f_ebur128_init.c > > diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c > index b9e210c05a..2d94cefce7 100644 > --- a/libavfilter/f_ebur128.c > +++ b/libavfilter/f_ebur128.c > @@ -579,6 +579,11 @@ static av_cold int init(AVFilterContext *ctx) > /* summary */ > av_log(ctx, AV_LOG_VERBOSE, "EBU +%d scale\n", ebur128->meter); > > + ebur128->dsp.filter_channels = ff_ebur128_filter_channels_c; > +#if ARCH_X86 > + ff_ebur128_init_x86(&ebur128->dsp); > +#endif > + > return 0; > } > > @@ -692,11 +697,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples) > MOVE_TO_NEXT_CACHED_ENTRY(400); > MOVE_TO_NEXT_CACHED_ENTRY(3000); > > - ff_ebur128_filter_channels_c(dsp, &samples[idx_insample * nb_channels], > - &ebur128->i400.cache[bin_id_400 * nb_channels], > - &ebur128->i3000.cache[bin_id_3000 * nb_channels], > - ebur128->i400.sum, ebur128->i3000.sum, > - nb_channels); > + dsp->filter_channels(dsp, &samples[idx_insample * nb_channels], > + &ebur128->i400.cache[bin_id_400 * nb_channels], > + &ebur128->i3000.cache[bin_id_3000 * nb_channels], > + ebur128->i400.sum, ebur128->i3000.sum, > + 
nb_channels); > > #define FIND_PEAK(global, sp, ptype) do { \ > int ch; \ > diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h > index 7b8e876576..1889e28bdd 100644 > --- a/libavfilter/f_ebur128.h > +++ b/libavfilter/f_ebur128.h > @@ -22,6 +22,9 @@ > #ifndef AVFILTER_F_EBUR128_H > #define AVFILTER_F_EBUR128_H > > +#include <assert.h> > +#include <stddef.h> > + > typedef struct EBUR128Biquad { > double b0, b1, b2; > double a1, a2; > @@ -35,8 +38,21 @@ typedef struct EBUR128DSPContext { > /* Cache of 3 samples for each channel */ > double *y; /* after pre-filter */ > double *z; /* after RLB-filter */ > + > + /* DSP functions */ > + void (*filter_channels)(const struct EBUR128DSPContext *dsp, > + const double *samples, > + double *cache_400, double *cache_3000, > + double *sum_400, double *sum_3000, > + int nb_channels); > } EBUR128DSPContext; > > +static_assert(offsetof(EBUR128DSPContext, pre) == 0, "struct layout mismatch"); > +static_assert(offsetof(EBUR128DSPContext, rlb) == 5 * sizeof(double), "struct layout mismatch"); > +static_assert(offsetof(EBUR128DSPContext, y) == 10 * sizeof(double), "struct layout mismatch"); > + > +void ff_ebur128_init_x86(EBUR128DSPContext *dsp); > + > void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *, > double *, double *, double *, double *, int); > > diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile > index 0d9a28a935..e5f0c55a5e 100644 > --- a/libavfilter/x86/Makefile > +++ b/libavfilter/x86/Makefile > @@ -7,6 +7,7 @@ OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o > OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o > OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o > OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution_init.o > +OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128_init.o > OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq_init.o > OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o > OBJS-$(CONFIG_GBLUR_FILTER) += x86/vf_gblur_init.o > @@ -52,6 +53,9 @@ 
X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o > X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o > X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o > X86ASM-OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution.o > +ifdef ARCH_X86_64
Nit: the usual way to do this is to put the check in the asm file itself rather than in the Makefile, i.e. wrap whatever needs it after the x86util.asm include in an `%if ARCH_X86_64` block. Also, a checkasm test would be nice.