* [FFmpeg-devel] [PATCH v4 02/13] avfilter/f_ebur128: simplify sample cache array
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
@ 2025-06-17 12:05 ` Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 03/13] avfilter/f_ebur128: use structs for biquad weights Niklas Haas
` (10 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:05 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
We don't need an X sample cache anymore, and we also can simplify the
access macro slightly.
---
libavfilter/f_ebur128.c | 29 +++++++++++------------------
1 file changed, 11 insertions(+), 18 deletions(-)
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 173a4f75ca..d0707e9ef9 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -115,7 +115,6 @@ typedef struct EBUR128Context {
/* Filter caches.
* The mult by 3 in the following is for X[i], X[i-1] and X[i-2] */
- double *x; ///< 3 input samples cache for each channel
double *y; ///< 3 pre-filter samples cache for each channel
double *z; ///< 3 RLB-filter samples cache for each channel
double pre_b[3]; ///< pre-filter numerator coefficients
@@ -446,11 +445,10 @@ static int config_audio_output(AVFilterLink *outlink)
AV_CH_SURROUND_DIRECT_LEFT |AV_CH_SURROUND_DIRECT_RIGHT)
ebur128->nb_channels = nb_channels;
- ebur128->x = av_calloc(nb_channels, 3 * sizeof(*ebur128->x));
ebur128->y = av_calloc(nb_channels, 3 * sizeof(*ebur128->y));
ebur128->z = av_calloc(nb_channels, 3 * sizeof(*ebur128->z));
ebur128->ch_weighting = av_calloc(nb_channels, sizeof(*ebur128->ch_weighting));
- if (!ebur128->ch_weighting || !ebur128->x || !ebur128->y || !ebur128->z)
+ if (!ebur128->ch_weighting || !ebur128->y || !ebur128->z)
return AVERROR(ENOMEM);
#define I400_BINS(x) ((x) * 4 / 10)
@@ -673,34 +671,30 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
MOVE_TO_NEXT_CACHED_ENTRY(3000);
for (ch = 0; ch < nb_channels; ch++) {
- double bin;
-
if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS)
ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(samples[idx_insample * nb_channels + ch]));
- ebur128->x[ch * 3] = samples[idx_insample * nb_channels + ch]; // set X[i]
-
if (!ebur128->ch_weighting[ch])
continue;
/* Y[i] = X[i]*b0 + X[i-1]*b1 + X[i-2]*b2 - Y[i-1]*a1 - Y[i-2]*a2 */
-#define FILTER(Y, X, NUM, DEN) do { \
- double *dst = ebur128->Y + ch*3; \
- double src = ebur128->X[ch*3] ; \
- double dst0 = NUM[0] * src + dst[1]; \
- dst[1] = NUM[1] * src + dst[2] - DEN[1] * dst0; \
- dst[2] = NUM[2] * src - DEN[2] * dst0; \
- dst[0] = dst0; \
+#define FILTER(DST, SRC, NUM, DEN) do { \
+ const double tmp = DST[0] = NUM[0] * SRC + DST[1]; \
+ DST[1] = NUM[1] * SRC + DST[2] - DEN[1] * tmp; \
+ DST[2] = NUM[2] * SRC - DEN[2] * tmp; \
} while (0)
+ const double x = samples[idx_insample * nb_channels + ch];
+ double *restrict y = &ebur128->y[3 * ch];
+ double *restrict z = &ebur128->z[3 * ch];
+
// TODO: merge both filters in one?
FILTER(y, x, ebur128->pre_b, ebur128->pre_a); // apply pre-filter
- FILTER(z, y, ebur128->rlb_b, ebur128->rlb_a); // apply RLB-filter
-
- bin = ebur128->z[ch * 3] * ebur128->z[ch * 3];
+ FILTER(z, *y, ebur128->rlb_b, ebur128->rlb_a); // apply RLB-filter
/* add the new value, and limit the sum to the cache size (400ms or 3s)
* by removing the oldest one */
+ double bin = *z * *z;
ebur128->i400.sum [ch] = ebur128->i400.sum [ch] + bin - ebur128->i400.cache [ch][bin_id_400];
ebur128->i3000.sum[ch] = ebur128->i3000.sum[ch] + bin - ebur128->i3000.cache[ch][bin_id_3000];
@@ -1073,7 +1067,6 @@ static av_cold void uninit(AVFilterContext *ctx)
}
av_freep(&ebur128->y_line_ref);
- av_freep(&ebur128->x);
av_freep(&ebur128->y);
av_freep(&ebur128->z);
av_freep(&ebur128->ch_weighting);
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v4 03/13] avfilter/f_ebur128: use structs for biquad weights
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 02/13] avfilter/f_ebur128: simplify sample cache array Niklas Haas
@ 2025-06-17 12:05 ` Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 04/13] avfilter/f_ebur128: use a single packed array for the integrator cache Niklas Haas
` (9 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:05 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
Simplifies the code a bit. As a side benefit, copying the coefficients to the
stack is marginally faster.
---
libavfilter/f_ebur128.c | 52 +++++++++++++++++++++++------------------
1 file changed, 29 insertions(+), 23 deletions(-)
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index d0707e9ef9..776329db1c 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -75,6 +75,11 @@ struct integrator {
struct rect { int x, y, w, h; };
+struct biquad {
+ double b0, b1, b2;
+ double a1, a2;
+};
+
typedef struct EBUR128Context {
const AVClass *class; ///< AVClass context for log and options purpose
@@ -117,10 +122,8 @@ typedef struct EBUR128Context {
* The mult by 3 in the following is for X[i], X[i-1] and X[i-2] */
double *y; ///< 3 pre-filter samples cache for each channel
double *z; ///< 3 RLB-filter samples cache for each channel
- double pre_b[3]; ///< pre-filter numerator coefficients
- double pre_a[3]; ///< pre-filter denominator coefficients
- double rlb_b[3]; ///< rlb-filter numerator coefficients
- double rlb_a[3]; ///< rlb-filter denominator coefficients
+ struct biquad pre;
+ struct biquad rlb;
struct integrator i400; ///< 400ms integrator, used for Momentary loudness (M), and Integrated loudness (I)
struct integrator i3000; ///< 3s integrator, used for Short term loudness (S), and Loudness Range (LRA)
@@ -405,21 +408,21 @@ static int config_audio_input(AVFilterLink *inlink)
double a0 = 1.0 + K / Q + K * K;
- ebur128->pre_b[0] = (Vh + Vb * K / Q + K * K) / a0;
- ebur128->pre_b[1] = 2.0 * (K * K - Vh) / a0;
- ebur128->pre_b[2] = (Vh - Vb * K / Q + K * K) / a0;
- ebur128->pre_a[1] = 2.0 * (K * K - 1.0) / a0;
- ebur128->pre_a[2] = (1.0 - K / Q + K * K) / a0;
+ ebur128->pre.b0 = (Vh + Vb * K / Q + K * K) / a0;
+ ebur128->pre.b1 = 2.0 * (K * K - Vh) / a0;
+ ebur128->pre.b2 = (Vh - Vb * K / Q + K * K) / a0;
+ ebur128->pre.a1 = 2.0 * (K * K - 1.0) / a0;
+ ebur128->pre.a2 = (1.0 - K / Q + K * K) / a0;
f0 = 38.13547087602444;
Q = 0.5003270373238773;
K = tan(M_PI * f0 / (double)inlink->sample_rate);
- ebur128->rlb_b[0] = 1.0;
- ebur128->rlb_b[1] = -2.0;
- ebur128->rlb_b[2] = 1.0;
- ebur128->rlb_a[1] = 2.0 * (K * K - 1.0) / (1.0 + K / Q + K * K);
- ebur128->rlb_a[2] = (1.0 - K / Q + K * K) / (1.0 + K / Q + K * K);
+ ebur128->rlb.b0 = 1.0;
+ ebur128->rlb.b1 = -2.0;
+ ebur128->rlb.b2 = 1.0;
+ ebur128->rlb.a1 = 2.0 * (K * K - 1.0) / (1.0 + K / Q + K * K);
+ ebur128->rlb.a2 = (1.0 - K / Q + K * K) / (1.0 + K / Q + K * K);
/* Force 100ms framing in case of metadata injection: the frames must have
* a granularity of the window overlap to be accurately exploited.
@@ -654,6 +657,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
}
#endif
+ const struct biquad pre = ebur128->pre;
+ const struct biquad rlb = ebur128->rlb;
+
for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
const int bin_id_400 = ebur128->i400.cache_pos;
const int bin_id_3000 = ebur128->i3000.cache_pos;
@@ -678,10 +684,10 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
continue;
/* Y[i] = X[i]*b0 + X[i-1]*b1 + X[i-2]*b2 - Y[i-1]*a1 - Y[i-2]*a2 */
-#define FILTER(DST, SRC, NUM, DEN) do { \
- const double tmp = DST[0] = NUM[0] * SRC + DST[1]; \
- DST[1] = NUM[1] * SRC + DST[2] - DEN[1] * tmp; \
- DST[2] = NUM[2] * SRC - DEN[2] * tmp; \
+#define FILTER(DST, SRC, FILT) do { \
+ const double tmp = DST[0] = FILT.b0 * SRC + DST[1]; \
+ DST[1] = FILT.b1 * SRC + DST[2] - FILT.a1 * tmp; \
+ DST[2] = FILT.b2 * SRC - FILT.a2 * tmp; \
} while (0)
const double x = samples[idx_insample * nb_channels + ch];
@@ -689,14 +695,14 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
double *restrict z = &ebur128->z[3 * ch];
// TODO: merge both filters in one?
- FILTER(y, x, ebur128->pre_b, ebur128->pre_a); // apply pre-filter
- FILTER(z, *y, ebur128->rlb_b, ebur128->rlb_a); // apply RLB-filter
+ FILTER(y, x, pre); // apply pre-filter
+ FILTER(z, *y, rlb); // apply RLB-filter
/* add the new value, and limit the sum to the cache size (400ms or 3s)
* by removing the oldest one */
- double bin = *z * *z;
- ebur128->i400.sum [ch] = ebur128->i400.sum [ch] + bin - ebur128->i400.cache [ch][bin_id_400];
- ebur128->i3000.sum[ch] = ebur128->i3000.sum[ch] + bin - ebur128->i3000.cache[ch][bin_id_3000];
+ const double bin = *z * *z;
+ ebur128->i400.sum [ch] += bin - ebur128->i400.cache [ch][bin_id_400];
+ ebur128->i3000.sum[ch] += bin - ebur128->i3000.cache[ch][bin_id_3000];
/* override old cache entry with the new value */
ebur128->i400.cache [ch][bin_id_400 ] = bin;
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v4 04/13] avfilter/f_ebur128: use a single packed array for the integrator cache
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 02/13] avfilter/f_ebur128: simplify sample cache array Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 03/13] avfilter/f_ebur128: use structs for biquad weights Niklas Haas
@ 2025-06-17 12:05 ` Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 05/13] avfilter/f_ebur128: move weights and cache to EBUR128DSPContext Niklas Haas
` (8 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:05 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
Instead of having a planar array for each channel, use a single packed array.
This will help processing multiple channels in parallel, as we can directly
load all channels' data in a single load instruction.
Also improves memory locality of data, as the loop order is:
for (samples) {
for (channels) {
process sample
}
}
---
libavfilter/f_ebur128.c | 36 ++++++++++--------------------------
1 file changed, 10 insertions(+), 26 deletions(-)
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 776329db1c..9f7c080750 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -62,7 +62,7 @@ struct hist_entry {
};
struct integrator {
- double **cache; ///< window of filtered samples (N ms)
+ double *cache; ///< window of filtered samples (N ms)
int cache_pos; ///< focus on the last added bin in the cache array
int cache_size;
double *sum; ///< sum of the last N ms filtered samples (cache content)
@@ -457,10 +457,12 @@ static int config_audio_output(AVFilterLink *outlink)
#define I400_BINS(x) ((x) * 4 / 10)
#define I3000_BINS(x) ((x) * 3)
+ ebur128->i400.cache_size = I400_BINS(outlink->sample_rate);
+ ebur128->i3000.cache_size = I3000_BINS(outlink->sample_rate);
ebur128->i400.sum = av_calloc(nb_channels, sizeof(*ebur128->i400.sum));
ebur128->i3000.sum = av_calloc(nb_channels, sizeof(*ebur128->i3000.sum));
- ebur128->i400.cache = av_calloc(nb_channels, sizeof(*ebur128->i400.cache));
- ebur128->i3000.cache = av_calloc(nb_channels, sizeof(*ebur128->i3000.cache));
+ ebur128->i400.cache = av_calloc(nb_channels * ebur128->i400.cache_size, sizeof(*ebur128->i400.cache));
+ ebur128->i3000.cache = av_calloc(nb_channels * ebur128->i3000.cache_size, sizeof(*ebur128->i3000.cache));
if (!ebur128->i400.sum || !ebur128->i3000.sum ||
!ebur128->i400.cache || !ebur128->i3000.cache)
return AVERROR(ENOMEM);
@@ -475,17 +477,6 @@ static int config_audio_output(AVFilterLink *outlink)
} else {
ebur128->ch_weighting[i] = 1.0;
}
-
- if (!ebur128->ch_weighting[i])
- continue;
-
- /* bins buffer for the two integration window (400ms and 3s) */
- ebur128->i400.cache_size = I400_BINS(outlink->sample_rate);
- ebur128->i3000.cache_size = I3000_BINS(outlink->sample_rate);
- ebur128->i400.cache[i] = av_calloc(ebur128->i400.cache_size, sizeof(*ebur128->i400.cache[0]));
- ebur128->i3000.cache[i] = av_calloc(ebur128->i3000.cache_size, sizeof(*ebur128->i3000.cache[0]));
- if (!ebur128->i400.cache[i] || !ebur128->i3000.cache[i])
- return AVERROR(ENOMEM);
}
#if CONFIG_SWRESAMPLE
@@ -663,6 +654,8 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
const int bin_id_400 = ebur128->i400.cache_pos;
const int bin_id_3000 = ebur128->i3000.cache_pos;
+ double *restrict cache_400 = &ebur128->i400.cache[bin_id_400 * nb_channels];
+ double *restrict cache_3000 = &ebur128->i3000.cache[bin_id_3000 * nb_channels];
#define MOVE_TO_NEXT_CACHED_ENTRY(time) do { \
ebur128->i##time.cache_pos++; \
@@ -701,12 +694,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
/* add the new value, and limit the sum to the cache size (400ms or 3s)
* by removing the oldest one */
const double bin = *z * *z;
- ebur128->i400.sum [ch] += bin - ebur128->i400.cache [ch][bin_id_400];
- ebur128->i3000.sum[ch] += bin - ebur128->i3000.cache[ch][bin_id_3000];
-
- /* override old cache entry with the new value */
- ebur128->i400.cache [ch][bin_id_400 ] = bin;
- ebur128->i3000.cache[ch][bin_id_3000] = bin;
+ ebur128->i400.sum [ch] += bin - cache_400[ch];
+ ebur128->i3000.sum[ch] += bin - cache_3000[ch];
+ cache_400[ch] = cache_3000[ch] = bin;
}
#define FIND_PEAK(global, sp, ptype) do { \
@@ -1083,12 +1073,6 @@ static av_cold void uninit(AVFilterContext *ctx)
av_freep(&ebur128->i3000.sum);
av_freep(&ebur128->i400.histogram);
av_freep(&ebur128->i3000.histogram);
- for (int i = 0; i < ebur128->nb_channels; i++) {
- if (ebur128->i400.cache)
- av_freep(&ebur128->i400.cache[i]);
- if (ebur128->i3000.cache)
- av_freep(&ebur128->i3000.cache[i]);
- }
av_freep(&ebur128->i400.cache);
av_freep(&ebur128->i3000.cache);
av_frame_free(&ebur128->outpicref);
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v4 05/13] avfilter/f_ebur128: move weights and cache to EBUR128DSPContext
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
` (2 preceding siblings ...)
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 04/13] avfilter/f_ebur128: use a single packed array for the integrator cache Niklas Haas
@ 2025-06-17 12:05 ` Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 06/13] avfilter/f_ebur128: split off C implementation to separate function Niklas Haas
` (7 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:05 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
---
libavfilter/f_ebur128.c | 53 +++++++++++++++++------------------------
libavfilter/f_ebur128.h | 40 +++++++++++++++++++++++++++++++
2 files changed, 62 insertions(+), 31 deletions(-)
create mode 100644 libavfilter/f_ebur128.h
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 9f7c080750..c3328dc520 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -43,6 +43,8 @@
#include "formats.h"
#include "video.h"
+#include "f_ebur128.h"
+
#define ABS_THRES -70 ///< silence gate: we discard anything below this absolute (LUFS) threshold
#define ABS_UP_THRES 10 ///< upper loud limit to consider (ABS_THRES being the minimum)
#define HIST_GRAIN 100 ///< defines histogram precision
@@ -75,13 +77,9 @@ struct integrator {
struct rect { int x, y, w, h; };
-struct biquad {
- double b0, b1, b2;
- double a1, a2;
-};
-
typedef struct EBUR128Context {
const AVClass *class; ///< AVClass context for log and options purpose
+ EBUR128DSPContext dsp;
/* peak metering */
int peak_mode; ///< enabled peak modes
@@ -118,13 +116,6 @@ typedef struct EBUR128Context {
int idx_insample; ///< current sample position of processed samples in single input frame
AVFrame *insamples; ///< input samples reference, updated regularly
- /* Filter caches.
- * The mult by 3 in the following is for X[i], X[i-1] and X[i-2] */
- double *y; ///< 3 pre-filter samples cache for each channel
- double *z; ///< 3 RLB-filter samples cache for each channel
- struct biquad pre;
- struct biquad rlb;
-
struct integrator i400; ///< 400ms integrator, used for Momentary loudness (M), and Integrated loudness (I)
struct integrator i3000; ///< 3s integrator, used for Short term loudness (S), and Loudness Range (LRA)
@@ -408,21 +399,21 @@ static int config_audio_input(AVFilterLink *inlink)
double a0 = 1.0 + K / Q + K * K;
- ebur128->pre.b0 = (Vh + Vb * K / Q + K * K) / a0;
- ebur128->pre.b1 = 2.0 * (K * K - Vh) / a0;
- ebur128->pre.b2 = (Vh - Vb * K / Q + K * K) / a0;
- ebur128->pre.a1 = 2.0 * (K * K - 1.0) / a0;
- ebur128->pre.a2 = (1.0 - K / Q + K * K) / a0;
+ ebur128->dsp.pre.b0 = (Vh + Vb * K / Q + K * K) / a0;
+ ebur128->dsp.pre.b1 = 2.0 * (K * K - Vh) / a0;
+ ebur128->dsp.pre.b2 = (Vh - Vb * K / Q + K * K) / a0;
+ ebur128->dsp.pre.a1 = 2.0 * (K * K - 1.0) / a0;
+ ebur128->dsp.pre.a2 = (1.0 - K / Q + K * K) / a0;
f0 = 38.13547087602444;
Q = 0.5003270373238773;
K = tan(M_PI * f0 / (double)inlink->sample_rate);
- ebur128->rlb.b0 = 1.0;
- ebur128->rlb.b1 = -2.0;
- ebur128->rlb.b2 = 1.0;
- ebur128->rlb.a1 = 2.0 * (K * K - 1.0) / (1.0 + K / Q + K * K);
- ebur128->rlb.a2 = (1.0 - K / Q + K * K) / (1.0 + K / Q + K * K);
+ ebur128->dsp.rlb.b0 = 1.0;
+ ebur128->dsp.rlb.b1 = -2.0;
+ ebur128->dsp.rlb.b2 = 1.0;
+ ebur128->dsp.rlb.a1 = 2.0 * (K * K - 1.0) / (1.0 + K / Q + K * K);
+ ebur128->dsp.rlb.a2 = (1.0 - K / Q + K * K) / (1.0 + K / Q + K * K);
/* Force 100ms framing in case of metadata injection: the frames must have
* a granularity of the window overlap to be accurately exploited.
@@ -448,10 +439,10 @@ static int config_audio_output(AVFilterLink *outlink)
AV_CH_SURROUND_DIRECT_LEFT |AV_CH_SURROUND_DIRECT_RIGHT)
ebur128->nb_channels = nb_channels;
- ebur128->y = av_calloc(nb_channels, 3 * sizeof(*ebur128->y));
- ebur128->z = av_calloc(nb_channels, 3 * sizeof(*ebur128->z));
+ ebur128->dsp.y = av_calloc(nb_channels, 3 * sizeof(*ebur128->dsp.y));
+ ebur128->dsp.z = av_calloc(nb_channels, 3 * sizeof(*ebur128->dsp.z));
ebur128->ch_weighting = av_calloc(nb_channels, sizeof(*ebur128->ch_weighting));
- if (!ebur128->ch_weighting || !ebur128->y || !ebur128->z)
+ if (!ebur128->ch_weighting || !ebur128->dsp.y || !ebur128->dsp.z)
return AVERROR(ENOMEM);
#define I400_BINS(x) ((x) * 4 / 10)
@@ -648,8 +639,8 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
}
#endif
- const struct biquad pre = ebur128->pre;
- const struct biquad rlb = ebur128->rlb;
+ const EBUR128Biquad pre = ebur128->dsp.pre;
+ const EBUR128Biquad rlb = ebur128->dsp.rlb;
for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
const int bin_id_400 = ebur128->i400.cache_pos;
@@ -684,8 +675,8 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
} while (0)
const double x = samples[idx_insample * nb_channels + ch];
- double *restrict y = &ebur128->y[3 * ch];
- double *restrict z = &ebur128->z[3 * ch];
+ double *restrict y = &ebur128->dsp.y[3 * ch];
+ double *restrict z = &ebur128->dsp.z[3 * ch];
// TODO: merge both filters in one?
FILTER(y, x, pre); // apply pre-filter
@@ -1063,8 +1054,8 @@ static av_cold void uninit(AVFilterContext *ctx)
}
av_freep(&ebur128->y_line_ref);
- av_freep(&ebur128->y);
- av_freep(&ebur128->z);
+ av_freep(&ebur128->dsp.y);
+ av_freep(&ebur128->dsp.z);
av_freep(&ebur128->ch_weighting);
av_freep(&ebur128->true_peaks);
av_freep(&ebur128->sample_peaks);
diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
new file mode 100644
index 0000000000..42cce9a5e4
--- /dev/null
+++ b/libavfilter/f_ebur128.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ * Copyright (c) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_F_EBUR128_H
+#define AVFILTER_F_EBUR128_H
+
+typedef struct EBUR128Biquad {
+ double b0, b1, b2;
+ double a1, a2;
+} EBUR128Biquad;
+
+typedef struct EBUR128DSPContext {
+ /* Filter data */
+ EBUR128Biquad pre;
+ EBUR128Biquad rlb;
+
+ /* Cache of 3 samples for each channel */
+ double *y; /* after pre-filter */
+ double *z; /* after RLB-filter */
+} EBUR128DSPContext;
+
+#endif /* AVFILTER_F_EBUR128_H */
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v4 06/13] avfilter/f_ebur128: split off C implementation to separate function
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
` (3 preceding siblings ...)
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 05/13] avfilter/f_ebur128: move weights and cache to EBUR128DSPContext Niklas Haas
@ 2025-06-17 12:05 ` Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 07/13] avfilter/x86/f_ebur128: add x86 AVX implementation Niklas Haas
` (6 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:05 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
I decided to separate out the peak measurement loop, both to avoid bloating
the function signature and because it is only conditionally used.
---
libavfilter/f_ebur128.c | 83 ++++++++++++++++++++++++-----------------
libavfilter/f_ebur128.h | 3 ++
2 files changed, 52 insertions(+), 34 deletions(-)
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index c3328dc520..b9e210c05a 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -609,11 +609,48 @@ static int gate_update(struct integrator *integ, double power,
return gate_hist_pos;
}
+void ff_ebur128_filter_channels_c(const EBUR128DSPContext *dsp,
+ const double *restrict samples,
+ double *restrict cache_400,
+ double *restrict cache_3000,
+ double *restrict sum_400,
+ double *restrict sum_3000,
+ const int nb_channels)
+{
+ const EBUR128Biquad pre = dsp->pre;
+ const EBUR128Biquad rlb = dsp->rlb;
+
+ for (int ch = 0; ch < nb_channels; ch++) {
+ /* Y[i] = X[i]*b0 + X[i-1]*b1 + X[i-2]*b2 - Y[i-1]*a1 - Y[i-2]*a2 */
+#define FILTER(DST, SRC, FILT) do { \
+ const double tmp = DST[0] = FILT.b0 * SRC + DST[1]; \
+ DST[1] = FILT.b1 * SRC + DST[2] - FILT.a1 * tmp; \
+ DST[2] = FILT.b2 * SRC - FILT.a2 * tmp; \
+} while (0)
+
+ const double x = samples[ch];
+ double *restrict y = &dsp->y[3 * ch];
+ double *restrict z = &dsp->z[3 * ch];
+
+ // TODO: merge both filters in one?
+ FILTER(y, x, pre); // apply pre-filter
+ FILTER(z, *y, rlb); // apply RLB-filter
+
+ /* add the new value, and limit the sum to the cache size (400ms or 3s)
+ * by removing the oldest one */
+ const double bin = *z * *z;
+ sum_400 [ch] += bin - cache_400[ch];
+ sum_3000[ch] += bin - cache_3000[ch];
+ cache_400[ch] = cache_3000[ch] = bin;
+ }
+}
+
static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
{
int i, ch, idx_insample, ret;
AVFilterContext *ctx = inlink->dst;
EBUR128Context *ebur128 = ctx->priv;
+ const EBUR128DSPContext *dsp = &ebur128->dsp;
const int nb_channels = ebur128->nb_channels;
const int nb_samples = insamples->nb_samples;
const double *samples = (double *)insamples->data[0];
@@ -639,14 +676,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
}
#endif
- const EBUR128Biquad pre = ebur128->dsp.pre;
- const EBUR128Biquad rlb = ebur128->dsp.rlb;
-
for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
const int bin_id_400 = ebur128->i400.cache_pos;
const int bin_id_3000 = ebur128->i3000.cache_pos;
- double *restrict cache_400 = &ebur128->i400.cache[bin_id_400 * nb_channels];
- double *restrict cache_3000 = &ebur128->i3000.cache[bin_id_3000 * nb_channels];
#define MOVE_TO_NEXT_CACHED_ENTRY(time) do { \
ebur128->i##time.cache_pos++; \
@@ -660,35 +692,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
MOVE_TO_NEXT_CACHED_ENTRY(400);
MOVE_TO_NEXT_CACHED_ENTRY(3000);
- for (ch = 0; ch < nb_channels; ch++) {
- if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS)
- ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(samples[idx_insample * nb_channels + ch]));
-
- if (!ebur128->ch_weighting[ch])
- continue;
-
- /* Y[i] = X[i]*b0 + X[i-1]*b1 + X[i-2]*b2 - Y[i-1]*a1 - Y[i-2]*a2 */
-#define FILTER(DST, SRC, FILT) do { \
- const double tmp = DST[0] = FILT.b0 * SRC + DST[1]; \
- DST[1] = FILT.b1 * SRC + DST[2] - FILT.a1 * tmp; \
- DST[2] = FILT.b2 * SRC - FILT.a2 * tmp; \
-} while (0)
-
- const double x = samples[idx_insample * nb_channels + ch];
- double *restrict y = &ebur128->dsp.y[3 * ch];
- double *restrict z = &ebur128->dsp.z[3 * ch];
-
- // TODO: merge both filters in one?
- FILTER(y, x, pre); // apply pre-filter
- FILTER(z, *y, rlb); // apply RLB-filter
-
- /* add the new value, and limit the sum to the cache size (400ms or 3s)
- * by removing the oldest one */
- const double bin = *z * *z;
- ebur128->i400.sum [ch] += bin - cache_400[ch];
- ebur128->i3000.sum[ch] += bin - cache_3000[ch];
- cache_400[ch] = cache_3000[ch] = bin;
- }
+ ff_ebur128_filter_channels_c(dsp, &samples[idx_insample * nb_channels],
+ &ebur128->i400.cache[bin_id_400 * nb_channels],
+ &ebur128->i3000.cache[bin_id_3000 * nb_channels],
+ ebur128->i400.sum, ebur128->i3000.sum,
+ nb_channels);
#define FIND_PEAK(global, sp, ptype) do { \
int ch; \
@@ -701,6 +709,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
} \
} while (0)
+ if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS) {
+ for (ch = 0; ch < nb_channels; ch++) {
+ const double sample = samples[idx_insample * nb_channels + ch];
+ ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(sample));
+ }
+ }
+
FIND_PEAK(ebur128->sample_peak, ebur128->sample_peaks, SAMPLES);
FIND_PEAK(ebur128->true_peak, ebur128->true_peaks, TRUE);
diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
index 42cce9a5e4..7b8e876576 100644
--- a/libavfilter/f_ebur128.h
+++ b/libavfilter/f_ebur128.h
@@ -37,4 +37,7 @@ typedef struct EBUR128DSPContext {
double *z; /* after RLB-filter */
} EBUR128DSPContext;
+void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
+ double *, double *, double *, double *, int);
+
#endif /* AVFILTER_F_EBUR128_H */
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v4 07/13] avfilter/x86/f_ebur128: add x86 AVX implementation
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
` (4 preceding siblings ...)
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 06/13] avfilter/f_ebur128: split off C implementation to separate function Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 08/13] avfilter/f_ebur128: remove pointless macro Niklas Haas
` (5 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
Processes two channels in parallel, using 128-bit XMM registers.
In theory, we could go up to YMM registers to process 4 channels, but this is
not a gain except for relatively high channel counts (e.g. 7.1), and also
complicates the sample load/store operations considerably.
I decided to only add an AVX variant, since the C code is not slow enough to
justify a separate function just for ancient CPUs.
---
libavfilter/f_ebur128.c | 15 ++--
libavfilter/f_ebur128.h | 16 ++++
libavfilter/x86/Makefile | 4 +
libavfilter/x86/f_ebur128.asm | 141 +++++++++++++++++++++++++++++++
libavfilter/x86/f_ebur128_init.c | 35 ++++++++
5 files changed, 206 insertions(+), 5 deletions(-)
create mode 100644 libavfilter/x86/f_ebur128.asm
create mode 100644 libavfilter/x86/f_ebur128_init.c
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index b9e210c05a..2d94cefce7 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -579,6 +579,11 @@ static av_cold int init(AVFilterContext *ctx)
/* summary */
av_log(ctx, AV_LOG_VERBOSE, "EBU +%d scale\n", ebur128->meter);
+ ebur128->dsp.filter_channels = ff_ebur128_filter_channels_c;
+#if ARCH_X86
+ ff_ebur128_init_x86(&ebur128->dsp);
+#endif
+
return 0;
}
@@ -692,11 +697,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
MOVE_TO_NEXT_CACHED_ENTRY(400);
MOVE_TO_NEXT_CACHED_ENTRY(3000);
- ff_ebur128_filter_channels_c(dsp, &samples[idx_insample * nb_channels],
- &ebur128->i400.cache[bin_id_400 * nb_channels],
- &ebur128->i3000.cache[bin_id_3000 * nb_channels],
- ebur128->i400.sum, ebur128->i3000.sum,
- nb_channels);
+ dsp->filter_channels(dsp, &samples[idx_insample * nb_channels],
+ &ebur128->i400.cache[bin_id_400 * nb_channels],
+ &ebur128->i3000.cache[bin_id_3000 * nb_channels],
+ ebur128->i400.sum, ebur128->i3000.sum,
+ nb_channels);
#define FIND_PEAK(global, sp, ptype) do { \
int ch; \
diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
index 7b8e876576..1889e28bdd 100644
--- a/libavfilter/f_ebur128.h
+++ b/libavfilter/f_ebur128.h
@@ -22,6 +22,9 @@
#ifndef AVFILTER_F_EBUR128_H
#define AVFILTER_F_EBUR128_H
+#include <assert.h>
+#include <stddef.h>
+
typedef struct EBUR128Biquad {
double b0, b1, b2;
double a1, a2;
@@ -35,8 +38,21 @@ typedef struct EBUR128DSPContext {
/* Cache of 3 samples for each channel */
double *y; /* after pre-filter */
double *z; /* after RLB-filter */
+
+ /* DSP functions */
+ void (*filter_channels)(const struct EBUR128DSPContext *dsp,
+ const double *samples,
+ double *cache_400, double *cache_3000,
+ double *sum_400, double *sum_3000,
+ int nb_channels);
} EBUR128DSPContext;
+static_assert(offsetof(EBUR128DSPContext, pre) == 0, "struct layout mismatch");
+static_assert(offsetof(EBUR128DSPContext, rlb) == 5 * sizeof(double), "struct layout mismatch");
+static_assert(offsetof(EBUR128DSPContext, y) == 10 * sizeof(double), "struct layout mismatch");
+
+void ff_ebur128_init_x86(EBUR128DSPContext *dsp);
+
void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
double *, double *, double *, double *, int);
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 0d9a28a935..e5f0c55a5e 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -7,6 +7,7 @@ OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution_init.o
+OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128_init.o
OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq_init.o
OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o
OBJS-$(CONFIG_GBLUR_FILTER) += x86/vf_gblur_init.o
@@ -52,6 +53,9 @@ X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
X86ASM-OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution.o
+ifdef ARCH_X86_64
+X86ASM-OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128.o
+endif
X86ASM-OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER) += x86/vf_framerate.o
X86ASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o
diff --git a/libavfilter/x86/f_ebur128.asm b/libavfilter/x86/f_ebur128.asm
new file mode 100644
index 0000000000..d9cc8d9361
--- /dev/null
+++ b/libavfilter/x86/f_ebur128.asm
@@ -0,0 +1,141 @@
+;*****************************************************************************
+;* x86-optimized functions for ebur128 filter
+;*
+;* Copyright (C) 2025 Niklas Haas
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern ebur128_filter_channels_c
+
+; Field offsets of one biquad coefficient set, one qword (double) per field.
+; The order b0, b1, b2, a1, a2 follows the leading members of the C
+; EBUR128Biquad struct declared in f_ebur128.h.
+struc Biquad
+ .b0 resq 1
+ .b1 resq 1
+ .b2 resq 1
+ .a1 resq 1
+ .a2 resq 1
+endstruc
+
+; Field offsets into the C EBUR128DSPContext: two 5-qword coefficient sets
+; (pre, rlb) followed by the y and z state pointers. The offsets of pre, rlb
+; and y are pinned on the C side by the static_asserts in f_ebur128.h.
+struc DSP
+ .pre resq 5
+ .rlb resq 5
+ .y resq 1
+ .z resq 1
+endstruc
+
+SECTION .text
+
+; MOVNQ num, dst, src
+; Move num doubles from src to dst: a scalar movsd when num is 1, otherwise
+; an unaligned packed movupd.
+%macro MOVNQ 3 ; num, dst, src
+%if %1 == 1
+ movsd %2, %3
+%else
+ movupd %2, %3
+%endif
+%endmacro
+
+; FILTER y0, y1, y2, x, b0, b1, b2, a1, a2, samples, num_channels
+; Run one biquad step on num_channels (1 or 2) channels at once.
+; %10 points at the state cache, three doubles (24 bytes) per channel;
+; it is advanced past the processed channels on exit.
+; %4 holds the input sample(s); %5-%9 hold the coefficients.
+; On exit %1 holds the filter output Y[0]. %2, %3 and %4 are clobbered
+; (%4 is overwritten with a2 * Y[0]).
+%macro FILTER 11 ; y0, y1, y2, x, b0, b1, b2, a1, a2, samples, num_channels
+ ; Y[0] := b0 * X + Y1
+ ; Y[1] := b1 * X + Y2 - a1 * Y[0]
+ ; Y[2] := b2 * X - a2 * Y[0]
+ ; load the cached Y1 / Y2 of the first channel into the low lane
+ movsd %1, [%10 + 8]
+ movsd %3, [%10 + 16]
+%if %11 > 1
+ ; second channel's Y1 / Y2 (24 bytes further on) go into the high lane
+ movhpd %1, [%10 + 32]
+ movhpd %3, [%10 + 40]
+%endif
+
+ ; %1 = new Y[0]; %2 is scratch here
+ mulpd %2, %5, %4
+ addpd %1, %2
+
+ ; %2 = new Y[1]
+ mulpd %2, %8, %1
+ subpd %3, %2
+ mulpd %2, %6, %4
+ addpd %2, %3
+
+ ; %3 = new Y[2]
+ mulpd %3, %7, %4
+ mulpd %4, %9, %1
+ subpd %3, %4
+
+ ; write the three state values back, low lane first
+ movsd [%10 + 0], %1
+ movsd [%10 + 8], %2
+ movsd [%10 + 16], %3
+%if %11 > 1
+ movhpd [%10 + 24], %1
+ movhpd [%10 + 32], %2
+ movhpd [%10 + 40], %3
+%endif
+ add %10, 24 * %11
+%endmacro
+
+; filter_channels num_channels
+; Process one input sample for num_channels (1 or 2) adjacent channels:
+; load the sample(s), run the filter with coefficients m4-m8 on the r7q
+; state, then the filter with coefficients m9-m13 on the r8q state, square
+; the result, and replace the old cache entries with it while adjusting the
+; running sums by (new - old). All pointers are advanced by num_channels
+; doubles.
+; NOTE(review): for num_channels == 1 the subpd/addpd memory operands below
+; are still 16 bytes wide, so they read 8 bytes past the element being
+; processed - confirm the cache and sum buffers are large enough for this.
+%macro filter_channels 1 ; num_channels
+ MOVNQ %1, m3, [samplesq]
+ add samplesq, 8 * %1
+
+ ; first stage: input m3 -> output m0; second stage: input m0 -> output m3
+ FILTER m0, m1, m2, m3, m4, m5, m6, m7, m8, r7q, %1
+ FILTER m3, m1, m2, m0, m9, m10, m11, m12, m13, r8q, %1
+
+ ; update sum and cache
+ mulpd m3, m3
+ ; m0 / m1 = squared output minus the cache entry about to be overwritten
+ subpd m0, m3, [cache400q]
+ subpd m1, m3, [cache3000q]
+ MOVNQ %1, [cache400q], m3
+ MOVNQ %1, [cache3000q], m3
+ add cache400q, 8 * %1
+ add cache3000q, 8 * %1
+ addpd m0, [sum400q]
+ addpd m1, [sum3000q]
+ MOVNQ %1, [sum400q], m0
+ MOVNQ %1, [sum3000q], m1
+ add sum400q, 8 * %1
+ add sum3000q, 8 * %1
+%endmacro
+
+; void ff_ebur128_filter_channels_avx(const EBUR128DSPContext *dsp,
+;                                     const double *samples,
+;                                     double *cache_400, double *cache_3000,
+;                                     double *sum_400, double *sum_3000,
+;                                     int nb_channels)
+; Filters one sample of every channel. Channels are handled two at a time;
+; an odd channel count is dealt with by processing a single channel first.
+; NOTE(review): the loop body runs before the count is checked, so this
+; assumes nb_channels >= 1 - confirm against the caller.
+INIT_XMM avx
+cglobal ebur128_filter_channels, 7, 9, 14, dsp, samples, cache400, cache3000, sum400, sum3000, channels
+ ; broadcast the first coefficient set (DSP.pre) into both lanes of m4-m8
+ movddup m4, [dspq + DSP.pre + Biquad.b0]
+ movddup m5, [dspq + DSP.pre + Biquad.b1]
+ movddup m6, [dspq + DSP.pre + Biquad.b2]
+ movddup m7, [dspq + DSP.pre + Biquad.a1]
+ movddup m8, [dspq + DSP.pre + Biquad.a2]
+
+ ; broadcast the second coefficient set (DSP.rlb) into both lanes of m9-m13
+ movddup m9, [dspq + DSP.rlb + Biquad.b0]
+ movddup m10, [dspq + DSP.rlb + Biquad.b1]
+ movddup m11, [dspq + DSP.rlb + Biquad.b2]
+ movddup m12, [dspq + DSP.rlb + Biquad.a1]
+ movddup m13, [dspq + DSP.rlb + Biquad.a2]
+
+ ; r7q / r8q walk the per-channel state caches y and z
+ mov r7q, [dspq + DSP.y]
+ mov r8q, [dspq + DSP.z]
+
+ ; handle odd channel count
+ test channelsd, 1
+ jnz .tail
+
+.loop:
+ filter_channels 2
+ sub channelsd, 2
+ jg .loop
+ RET
+
+.tail:
+ ; one channel on its own, then fall into the paired loop if any remain
+ filter_channels 1
+ dec channelsd
+ test channelsd, channelsd
+ jnz .loop
+ RET
diff --git a/libavfilter/x86/f_ebur128_init.c b/libavfilter/x86/f_ebur128_init.c
new file mode 100644
index 0000000000..8f38aee967
--- /dev/null
+++ b/libavfilter/x86/f_ebur128_init.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/f_ebur128.h"
+
+void ff_ebur128_filter_channels_avx(const EBUR128DSPContext *, const double *,
+ double *, double *, double *, double *, int);
+
+/**
+ * Install x86-specific implementations into an EBU R128 DSP context.
+ *
+ * Overrides dsp->filter_channels with the AVX routine when the build targets
+ * x86-64 and EXTERNAL_AVX() accepts the runtime CPU flags; otherwise dsp is
+ * left untouched.
+ *
+ * @param dsp DSP context whose function pointers may be replaced
+ */
+av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ /* ARCH_X86_64 comes first: the asm object is only built for x86-64 */
+ if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags))
+ dsp->filter_channels = ff_ebur128_filter_channels_avx;
+}
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v4 08/13] avfilter/f_ebur128: remove pointless macro
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
` (5 preceding siblings ...)
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 07/13] avfilter/x86/f_ebur128: add x86 AVX implementation Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 09/13] avfilter/f_ebur128: move true peak calculation out of main loop Niklas Haas
` (4 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
This macro neither shortens the code nor aids readability.
---
libavfilter/f_ebur128.c | 24 +++++++++++-------------
1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 2d94cefce7..2e1eedd855 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -682,20 +682,18 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
#endif
for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
- const int bin_id_400 = ebur128->i400.cache_pos;
- const int bin_id_3000 = ebur128->i3000.cache_pos;
-
-#define MOVE_TO_NEXT_CACHED_ENTRY(time) do { \
- ebur128->i##time.cache_pos++; \
- if (ebur128->i##time.cache_pos == \
- ebur128->i##time.cache_size) { \
- ebur128->i##time.filled = 1; \
- ebur128->i##time.cache_pos = 0; \
- } \
-} while (0)
+ const int bin_id_400 = ebur128->i400.cache_pos++;
+ const int bin_id_3000 = ebur128->i3000.cache_pos++;
+
+ if (ebur128->i400.cache_pos == ebur128->i400.cache_size) {
+ ebur128->i400.filled = 1;
+ ebur128->i400.cache_pos = 0;
+ }
- MOVE_TO_NEXT_CACHED_ENTRY(400);
- MOVE_TO_NEXT_CACHED_ENTRY(3000);
+ if (ebur128->i3000.cache_pos == ebur128->i3000.cache_size) {
+ ebur128->i3000.filled = 1;
+ ebur128->i3000.cache_pos = 0;
+ }
dsp->filter_channels(dsp, &samples[idx_insample * nb_channels],
&ebur128->i400.cache[bin_id_400 * nb_channels],
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v4 09/13] avfilter/f_ebur128: move true peak calculation out of main loop
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
` (6 preceding siblings ...)
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 08/13] avfilter/f_ebur128: remove pointless macro Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 10/13] avfilter/f_ebur128: lift sample " Niklas Haas
` (3 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
Easier to read, less convoluted, and ~30% faster. Most importantly, this
avoids repeating the redundant recalculation of the true peak on every
single sample, by moving the FIND_PEAK() loop out of the main loop. (Note
that FIND_PEAK() does not depend on the current sample index at all, so
there is no reason for it to ever be recomputed here)
---
libavfilter/f_ebur128.c | 23 ++++++++++++++---------
1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 2e1eedd855..23092b597f 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -668,16 +668,22 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
(const uint8_t **)insamples->data, nb_samples);
if (ret < 0)
return ret;
- for (ch = 0; ch < nb_channels; ch++)
- ebur128->true_peaks_per_frame[ch] = 0.0;
- for (idx_insample = 0; idx_insample < ret; idx_insample++) {
- for (ch = 0; ch < nb_channels; ch++) {
- ebur128->true_peaks[ch] = FFMAX(ebur128->true_peaks[ch], fabs(*swr_samples));
- ebur128->true_peaks_per_frame[ch] = FFMAX(ebur128->true_peaks_per_frame[ch],
- fabs(*swr_samples));
- swr_samples++;
+
+ double maxpeak = 0.0;
+ for (int ch = 0; ch < nb_channels; ch++) {
+ double tp = ebur128->true_peaks[ch];
+ double tppf = 0.0;
+ for (int i = 0; i < ret; i++) {
+ const double sample = fabs(swr_samples[i * nb_channels]);
+ tp = FFMAX(tp, sample);
+ tppf = FFMAX(tppf, sample);
}
+ maxpeak = FFMAX(maxpeak, tp);
+ ebur128->true_peaks[ch] = tp;
+ ebur128->true_peaks_per_frame[ch] = tppf;
}
+
+ ebur128->true_peak = DBFS(maxpeak);
}
#endif
@@ -720,7 +726,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
}
FIND_PEAK(ebur128->sample_peak, ebur128->sample_peaks, SAMPLES);
- FIND_PEAK(ebur128->true_peak, ebur128->true_peaks, TRUE);
/* For integrated loudness, gating blocks are 400ms long with 75%
* overlap (see BS.1770-2 p5), so a re-computation is needed each 100ms
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v4 10/13] avfilter/f_ebur128: lift sample peak calculation out of main loop
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
` (7 preceding siblings ...)
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 09/13] avfilter/f_ebur128: move true peak calculation out of main loop Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 11/13] avfilter/f_ebur128: move variable declarations to usage site Niklas Haas
` (2 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
This is substantially faster (~55%) than the transposed loop, and also
avoids an unnecessary macro.
---
libavfilter/f_ebur128.c | 38 ++++++++++++++++++--------------------
1 file changed, 18 insertions(+), 20 deletions(-)
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 23092b597f..4051b1ea95 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -687,6 +687,24 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
}
#endif
+ if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS) {
+ double maxpeak = 0.0;
+ for (int ch = 0; ch < nb_channels; ch++) {
+ const double *restrict samples_ch = &samples[ch];
+ double sp = ebur128->sample_peaks[ch];
+
+ for (int i = ebur128->idx_insample; i < nb_samples; i++) {
+ const double sample = fabs(samples_ch[nb_channels * i]);
+ sp = FFMAX(sp, sample);
+ }
+ maxpeak = FFMAX(maxpeak, sp);
+ ebur128->sample_peaks[ch] = sp;
+ }
+
+ ebur128->sample_peak = DBFS(maxpeak);
+ }
+
+
for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
const int bin_id_400 = ebur128->i400.cache_pos++;
const int bin_id_3000 = ebur128->i3000.cache_pos++;
@@ -707,26 +725,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
ebur128->i400.sum, ebur128->i3000.sum,
nb_channels);
-#define FIND_PEAK(global, sp, ptype) do { \
- int ch; \
- double maxpeak; \
- maxpeak = 0.0; \
- if (ebur128->peak_mode & PEAK_MODE_ ## ptype ## _PEAKS) { \
- for (ch = 0; ch < ebur128->nb_channels; ch++) \
- maxpeak = FFMAX(maxpeak, sp[ch]); \
- global = DBFS(maxpeak); \
- } \
-} while (0)
-
- if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS) {
- for (ch = 0; ch < nb_channels; ch++) {
- const double sample = samples[idx_insample * nb_channels + ch];
- ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(sample));
- }
- }
-
- FIND_PEAK(ebur128->sample_peak, ebur128->sample_peaks, SAMPLES);
-
/* For integrated loudness, gating blocks are 400ms long with 75%
* overlap (see BS.1770-2 p5), so a re-computation is needed each 100ms
* (4800 samples at 48kHz). */
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v4 11/13] avfilter/f_ebur128: move variable declarations to usage site
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
` (8 preceding siblings ...)
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 10/13] avfilter/f_ebur128: lift sample " Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 12/13] avfilter/f_ebur128: move true peak calculation to DSP function Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 13/13] avfilter/x86/f_ebur128: implement AVX true peak calculation Niklas Haas
11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
This is actually allowed by non-ancient versions of C.
---
libavfilter/f_ebur128.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 4051b1ea95..1fb7129271 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -652,7 +652,7 @@ void ff_ebur128_filter_channels_c(const EBUR128DSPContext *dsp,
static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
{
- int i, ch, idx_insample, ret;
+ int ret;
AVFilterContext *ctx = inlink->dst;
EBUR128Context *ebur128 = ctx->priv;
const EBUR128DSPContext *dsp = &ebur128->dsp;
@@ -705,7 +705,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
}
- for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
+ for (int idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
const int bin_id_400 = ebur128->i400.cache_pos++;
const int bin_id_3000 = ebur128->i3000.cache_pos++;
@@ -741,7 +741,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
#define COMPUTE_LOUDNESS(m, time) do { \
if (ebur128->i##time.filled) { \
/* weighting sum of the last <time> ms */ \
- for (ch = 0; ch < nb_channels; ch++) \
+ for (int ch = 0; ch < nb_channels; ch++) \
power_##time += ebur128->ch_weighting[ch] * ebur128->i##time.sum[ch]; \
power_##time /= I##time##_BINS(inlink->sample_rate); \
} \
@@ -762,7 +762,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
/* compute integrated loudness by summing the histogram values
* above the relative threshold */
- for (i = gate_hist_pos; i < HIST_SIZE; i++) {
+ for (int i = gate_hist_pos; i < HIST_SIZE; i++) {
const unsigned nb_v = ebur128->i400.histogram[i].count;
nb_integrated += nb_v;
integrated_sum += nb_v * ebur128->i400.histogram[i].energy;
@@ -788,7 +788,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
int gate_hist_pos = gate_update(&ebur128->i3000, power_3000,
loudness_3000, LRA_GATE_THRES);
- for (i = gate_hist_pos; i < HIST_SIZE; i++)
+ for (int i = gate_hist_pos; i < HIST_SIZE; i++)
nb_powers += ebur128->i3000.histogram[i].count;
if (nb_powers) {
uint64_t n, nb_pow;
@@ -796,7 +796,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
/* get lower loudness to consider */
n = 0;
nb_pow = LRA_LOWER_PRC * nb_powers * 0.01 + 0.5;
- for (i = gate_hist_pos; i < HIST_SIZE; i++) {
+ for (int i = gate_hist_pos; i < HIST_SIZE; i++) {
n += ebur128->i3000.histogram[i].count;
if (n >= nb_pow) {
ebur128->lra_low = ebur128->i3000.histogram[i].loudness;
@@ -807,7 +807,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
/* get higher loudness to consider */
n = nb_powers;
nb_pow = LRA_HIGHER_PRC * nb_powers * 0.01 + 0.5;
- for (i = HIST_SIZE - 1; i >= 0; i--) {
+ for (int i = HIST_SIZE - 1; i >= 0; i--) {
n -= FFMIN(n, ebur128->i3000.histogram[i].count);
if (n < nb_pow) {
ebur128->lra_high = ebur128->i3000.histogram[i].loudness;
@@ -909,7 +909,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
if (ebur128->peak_mode & PEAK_MODE_ ## ptype ## _PEAKS) { \
double max_peak = 0.0; \
char key[64]; \
- for (ch = 0; ch < nb_channels; ch++) { \
+ for (int ch = 0; ch < nb_channels; ch++) { \
snprintf(key, sizeof(key), \
META_PREFIX AV_STRINGIFY(name) "_peaks_ch%d", ch); \
max_peak = fmax(max_peak, ebur128->name##_peaks[ch]); \
@@ -948,7 +948,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
#define PRINT_PEAKS(str, sp, ptype) do { \
if (ebur128->peak_mode & PEAK_MODE_ ## ptype ## _PEAKS) { \
av_log(ctx, ebur128->loglevel, " " str ":"); \
- for (ch = 0; ch < nb_channels; ch++) \
+ for (int ch = 0; ch < nb_channels; ch++) \
av_log(ctx, ebur128->loglevel, " %5.1f", DBFS(sp[ch])); \
av_log(ctx, ebur128->loglevel, " dBFS"); \
} \
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v4 12/13] avfilter/f_ebur128: move true peak calculation to DSP function
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
` (9 preceding siblings ...)
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 11/13] avfilter/f_ebur128: move variable declarations to usage site Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 13/13] avfilter/x86/f_ebur128: implement AVX true peak calculation Niklas Haas
11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
---
libavfilter/f_ebur128.c | 43 ++++++++++++++++++++++++++---------------
libavfilter/f_ebur128.h | 4 ++++
2 files changed, 31 insertions(+), 16 deletions(-)
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 1fb7129271..0adc89c823 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -580,6 +580,8 @@ static av_cold int init(AVFilterContext *ctx)
av_log(ctx, AV_LOG_VERBOSE, "EBU +%d scale\n", ebur128->meter);
ebur128->dsp.filter_channels = ff_ebur128_filter_channels_c;
+ ebur128->dsp.true_peak = ff_ebur128_true_peak_c;
+
#if ARCH_X86
ff_ebur128_init_x86(&ebur128->dsp);
#endif
@@ -650,6 +652,28 @@ void ff_ebur128_filter_channels_c(const EBUR128DSPContext *dsp,
}
}
+/**
+ * Scan a block of interleaved samples and update the per-channel true peaks.
+ *
+ * @param true_peaks           running peak per channel; read and updated in place
+ * @param true_peaks_per_frame receives, per channel, the peak found in this
+ *                             call only
+ * @param nb_channels          number of interleaved channels
+ * @param samples              interleaved input, nb_samples * nb_channels values
+ * @param nb_samples           number of samples per channel
+ * @return the largest running peak over all channels (linear scale; the
+ *         caller converts to dBFS)
+ */
+double ff_ebur128_true_peak_c(double *restrict true_peaks,
+                              double *restrict true_peaks_per_frame,
+                              const int nb_channels, const double *samples,
+                              const int nb_samples)
+{
+    double maxpeak = 0.0;
+    for (int ch = 0; ch < nb_channels; ch++) {
+        /* The buffer is interleaved, so channel ch starts at offset ch;
+         * without this offset every channel would scan channel 0. */
+        const double *samples_ch = &samples[ch];
+        double tp = true_peaks[ch];
+        double tppf = 0.0;
+        for (int i = 0; i < nb_samples; i++) {
+            const double sample = fabs(samples_ch[i * nb_channels]);
+            tp = FFMAX(tp, sample);
+            tppf = FFMAX(tppf, sample);
+        }
+        maxpeak = FFMAX(maxpeak, tp);
+        true_peaks[ch] = tp;
+        true_peaks_per_frame[ch] = tppf;
+    }
+
+    return maxpeak;
+}
+
static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
{
int ret;
@@ -669,21 +693,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
if (ret < 0)
return ret;
- double maxpeak = 0.0;
- for (int ch = 0; ch < nb_channels; ch++) {
- double tp = ebur128->true_peaks[ch];
- double tppf = 0.0;
- for (int i = 0; i < ret; i++) {
- const double sample = fabs(swr_samples[i * nb_channels]);
- tp = FFMAX(tp, sample);
- tppf = FFMAX(tppf, sample);
- }
- maxpeak = FFMAX(maxpeak, tp);
- ebur128->true_peaks[ch] = tp;
- ebur128->true_peaks_per_frame[ch] = tppf;
- }
-
- ebur128->true_peak = DBFS(maxpeak);
+ ebur128->true_peak = DBFS(dsp->true_peak(ebur128->true_peaks,
+ ebur128->true_peaks_per_frame,
+ nb_channels, swr_samples, ret));
}
#endif
@@ -704,7 +716,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
ebur128->sample_peak = DBFS(maxpeak);
}
-
for (int idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
const int bin_id_400 = ebur128->i400.cache_pos++;
const int bin_id_3000 = ebur128->i3000.cache_pos++;
diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
index 1889e28bdd..8aab7838a0 100644
--- a/libavfilter/f_ebur128.h
+++ b/libavfilter/f_ebur128.h
@@ -45,6 +45,8 @@ typedef struct EBUR128DSPContext {
double *cache_400, double *cache_3000,
double *sum_400, double *sum_3000,
int nb_channels);
+
+ double (*true_peak)(double *, double *, int, const double *, int);
} EBUR128DSPContext;
static_assert(offsetof(EBUR128DSPContext, pre) == 0, "struct layout mismatch");
@@ -56,4 +58,6 @@ void ff_ebur128_init_x86(EBUR128DSPContext *dsp);
void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
double *, double *, double *, double *, int);
+double ff_ebur128_true_peak_c(double *, double *, int, const double *, int);
+
#endif /* AVFILTER_F_EBUR128_H */
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v4 13/13] avfilter/x86/f_ebur128: implement AVX true peak calculation
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
` (10 preceding siblings ...)
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 12/13] avfilter/f_ebur128: move true peak calculation to DSP function Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
2025-06-17 13:48 ` Niklas Haas
11 siblings, 1 reply; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
From: Niklas Haas <git@haasn.dev>
Stereo only, for simplicity. Slightly faster than the C code.
---
libavfilter/f_ebur128.c | 8 +++-----
libavfilter/f_ebur128.h | 2 +-
libavfilter/x86/f_ebur128.asm | 25 +++++++++++++++++++++++++
libavfilter/x86/f_ebur128_init.c | 9 +++++++--
4 files changed, 36 insertions(+), 8 deletions(-)
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 0adc89c823..c64f6ed032 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -502,6 +502,9 @@ static int config_audio_output(AVFilterLink *outlink)
return AVERROR(ENOMEM);
}
+#if ARCH_X86
+ ff_ebur128_init_x86(&ebur128->dsp, nb_channels);
+#endif
return 0;
}
@@ -581,11 +584,6 @@ static av_cold int init(AVFilterContext *ctx)
ebur128->dsp.filter_channels = ff_ebur128_filter_channels_c;
ebur128->dsp.true_peak = ff_ebur128_true_peak_c;
-
-#if ARCH_X86
- ff_ebur128_init_x86(&ebur128->dsp);
-#endif
-
return 0;
}
diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
index 8aab7838a0..5fb9d4a8d5 100644
--- a/libavfilter/f_ebur128.h
+++ b/libavfilter/f_ebur128.h
@@ -53,7 +53,7 @@ static_assert(offsetof(EBUR128DSPContext, pre) == 0, "struct l
static_assert(offsetof(EBUR128DSPContext, rlb) == 5 * sizeof(double), "struct layout mismatch");
static_assert(offsetof(EBUR128DSPContext, y) == 10 * sizeof(double), "struct layout mismatch");
-void ff_ebur128_init_x86(EBUR128DSPContext *dsp);
+void ff_ebur128_init_x86(EBUR128DSPContext *dsp, int nb_channels);
void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
double *, double *, double *, double *, int);
diff --git a/libavfilter/x86/f_ebur128.asm b/libavfilter/x86/f_ebur128.asm
index d9cc8d9361..53dd3f858d 100644
--- a/libavfilter/x86/f_ebur128.asm
+++ b/libavfilter/x86/f_ebur128.asm
@@ -39,6 +39,10 @@ struc DSP
.z resq 1
endstruc
+SECTION_RODATA
+
+abs_mask: dq 0x7FFFFFFFFFFFFFFF
+
SECTION .text
%macro MOVNQ 3 ; num, dst, src
@@ -139,3 +143,24 @@ cglobal ebur128_filter_channels, 7, 9, 14, dsp, samples, cache400, cache3000, su
test channelsd, channelsd
jnz .loop
RET
+
+; double ff_ebur128_true_peak_stereo_avx(double *tp, double *tppf,
+;                                        int channels, const double *samples,
+;                                        int nb_samples)
+; Stereo-only peak scan: each 16-byte load covers one sample of both
+; channels. Updates the two running peaks at [tp], writes the two per-call
+; peaks to [tppf] and returns the larger running peak in xmm0 (m0).
+; The channels argument is not read.
+cglobal ebur128_true_peak_stereo, 5, 6, 5, tp, tppf, channels, samples, nb_samples
+    ; movddup rather than vpbroadcastq: the latter needs AVX2, but this
+    ; function is selected on AVX alone
+    movddup m4, [abs_mask]
+    pxor    m0, m0          ; maxpeak
+    movupd  m1, [tpq]       ; tp
+    pxor    m2, m2          ; tppf
+    ; nothing to scan: skip the loop so [samplesq] is never read
+    test    nb_samplesd, nb_samplesd
+    jle     .end
+.inner:
+    movupd  m3, [samplesq]
+    add     samplesq, 16
+    pand    m3, m4          ; clear the sign bits -> fabs()
+    maxpd   m1, m3
+    maxpd   m2, m3
+    dec     nb_samplesd
+    jg      .inner
+.end:
+    movupd  [tpq], m1
+    movupd  [tppfq], m2
+    ; horizontal max of the two running peaks, left in the low lane of m0
+    maxpd   m0, m1
+    shufpd  m1, m0, m0, 1
+    maxpd   m0, m1
+    RET
diff --git a/libavfilter/x86/f_ebur128_init.c b/libavfilter/x86/f_ebur128_init.c
index 8f38aee967..527e5e4dbc 100644
--- a/libavfilter/x86/f_ebur128_init.c
+++ b/libavfilter/x86/f_ebur128_init.c
@@ -26,10 +26,15 @@
void ff_ebur128_filter_channels_avx(const EBUR128DSPContext *, const double *,
double *, double *, double *, double *, int);
-av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp)
+double ff_ebur128_true_peak_stereo_avx(double *, double *, int, const double *, int);
+
+av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp, int nb_channels)
{
int cpu_flags = av_get_cpu_flags();
- if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags))
+ if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags)) {
dsp->filter_channels = ff_ebur128_filter_channels_avx;
+ if (nb_channels == 2)
+ dsp->true_peak = ff_ebur128_true_peak_stereo_avx;
+ }
}
--
2.49.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH v4 13/13] avfilter/x86/f_ebur128: implement AVX true peak calculation
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 13/13] avfilter/x86/f_ebur128: implement AVX true peak calculation Niklas Haas
@ 2025-06-17 13:48 ` Niklas Haas
0 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 13:48 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
On Tue, 17 Jun 2025 14:06:06 +0200 Niklas Haas <ffmpeg@haasn.xyz> wrote:
> From: Niklas Haas <git@haasn.dev>
>
> Stereo only, for simplicity. Slightly faster than the C code.
I will drop this and the prior commit and resubmit them in a refactored form
that also speeds up the sample peak calculation.
In the meantime, I would appreciate a review of the rest.
> ---
> libavfilter/f_ebur128.c | 8 +++-----
> libavfilter/f_ebur128.h | 2 +-
> libavfilter/x86/f_ebur128.asm | 25 +++++++++++++++++++++++++
> libavfilter/x86/f_ebur128_init.c | 9 +++++++--
> 4 files changed, 36 insertions(+), 8 deletions(-)
>
> diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
> index 0adc89c823..c64f6ed032 100644
> --- a/libavfilter/f_ebur128.c
> +++ b/libavfilter/f_ebur128.c
> @@ -502,6 +502,9 @@ static int config_audio_output(AVFilterLink *outlink)
> return AVERROR(ENOMEM);
> }
>
> +#if ARCH_X86
> + ff_ebur128_init_x86(&ebur128->dsp, nb_channels);
> +#endif
> return 0;
> }
>
> @@ -581,11 +584,6 @@ static av_cold int init(AVFilterContext *ctx)
>
> ebur128->dsp.filter_channels = ff_ebur128_filter_channels_c;
> ebur128->dsp.true_peak = ff_ebur128_true_peak_c;
> -
> -#if ARCH_X86
> - ff_ebur128_init_x86(&ebur128->dsp);
> -#endif
> -
> return 0;
> }
>
> diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
> index 8aab7838a0..5fb9d4a8d5 100644
> --- a/libavfilter/f_ebur128.h
> +++ b/libavfilter/f_ebur128.h
> @@ -53,7 +53,7 @@ static_assert(offsetof(EBUR128DSPContext, pre) == 0, "struct l
> static_assert(offsetof(EBUR128DSPContext, rlb) == 5 * sizeof(double), "struct layout mismatch");
> static_assert(offsetof(EBUR128DSPContext, y) == 10 * sizeof(double), "struct layout mismatch");
>
> -void ff_ebur128_init_x86(EBUR128DSPContext *dsp);
> +void ff_ebur128_init_x86(EBUR128DSPContext *dsp, int nb_channels);
>
> void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
> double *, double *, double *, double *, int);
> diff --git a/libavfilter/x86/f_ebur128.asm b/libavfilter/x86/f_ebur128.asm
> index d9cc8d9361..53dd3f858d 100644
> --- a/libavfilter/x86/f_ebur128.asm
> +++ b/libavfilter/x86/f_ebur128.asm
> @@ -39,6 +39,10 @@ struc DSP
> .z resq 1
> endstruc
>
> +SECTION_RODATA
> +
> +abs_mask: dq 0x7FFFFFFFFFFFFFFF
> +
> SECTION .text
>
> %macro MOVNQ 3 ; num, dst, src
> @@ -139,3 +143,24 @@ cglobal ebur128_filter_channels, 7, 9, 14, dsp, samples, cache400, cache3000, su
> test channelsd, channelsd
> jnz .loop
> RET
> +
> +cglobal ebur128_true_peak_stereo, 5, 6, 4, tp, tppf, channels, samples, nb_samples
> + vpbroadcastq m4, [abs_mask]
> + pxor m0, m0 ; maxpeak
> + movupd m1, [tpq] ; tp
> + pxor m2, m2 ; tppf
> +.inner:
> + movupd m3, [samplesq]
> + add samplesq, 16
> + pand m3, m4
> + maxpd m1, m3
> + maxpd m2, m3
> + dec nb_samplesd
> + jg .inner
> + movupd [tpq], m1
> + movupd [tppfq], m2
> + maxpd m0, m1
> + shufpd m1, m0, m0, 1
> + maxpd m0, m1
> + movq rax, m0
> + RET
> diff --git a/libavfilter/x86/f_ebur128_init.c b/libavfilter/x86/f_ebur128_init.c
> index 8f38aee967..527e5e4dbc 100644
> --- a/libavfilter/x86/f_ebur128_init.c
> +++ b/libavfilter/x86/f_ebur128_init.c
> @@ -26,10 +26,15 @@
> void ff_ebur128_filter_channels_avx(const EBUR128DSPContext *, const double *,
> double *, double *, double *, double *, int);
>
> -av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp)
> +double ff_ebur128_true_peak_stereo_avx(double *, double *, int, const double *, int);
> +
> +av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp, int nb_channels)
> {
> int cpu_flags = av_get_cpu_flags();
>
> - if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags))
> + if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags)) {
> dsp->filter_channels = ff_ebur128_filter_channels_avx;
> + if (nb_channels == 2)
> + dsp->true_peak = ff_ebur128_true_peak_stereo_avx;
> + }
> }
> --
> 2.49.0
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread