[FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II
@ 2025-06-17 12:05 Niklas Haas
  2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 02/13] avfilter/f_ebur128: simplify sample cache array Niklas Haas
                   ` (11 more replies)
  0 siblings, 12 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:05 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

Use the transposed direct form II biquad structure instead of direct form I.
See af_biquads.c for the math. Also eliminate an unnecessary indirection.
---
 libavfilter/f_ebur128.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 768f062bac..173a4f75ca 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -686,17 +686,15 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
             /* Y[i] = X[i]*b0 + X[i-1]*b1 + X[i-2]*b2 - Y[i-1]*a1 - Y[i-2]*a2 */
 #define FILTER(Y, X, NUM, DEN) do {                                             \
             double *dst = ebur128->Y + ch*3;                                    \
-            double *src = ebur128->X + ch*3;                                    \
-            dst[2] = dst[1];                                                    \
-            dst[1] = dst[0];                                                    \
-            dst[0] = src[0]*NUM[0] + src[1]*NUM[1] + src[2]*NUM[2]              \
-                                   - dst[1]*DEN[1] - dst[2]*DEN[2];             \
+            double src = ebur128->X[ch*3]  ;                                    \
+            double dst0 = NUM[0] * src + dst[1];                                \
+            dst[1] = NUM[1] * src + dst[2] - DEN[1] * dst0;                     \
+            dst[2] = NUM[2] * src - DEN[2] * dst0;                              \
+            dst[0] = dst0;                                                      \
 } while (0)
 
             // TODO: merge both filters in one?
             FILTER(y, x, ebur128->pre_b, ebur128->pre_a);  // apply pre-filter
-            ebur128->x[ch * 3 + 2] = ebur128->x[ch * 3 + 1];
-            ebur128->x[ch * 3 + 1] = ebur128->x[ch * 3    ];
             FILTER(z, y, ebur128->rlb_b, ebur128->rlb_a);  // apply RLB-filter
 
             bin = ebur128->z[ch * 3] * ebur128->z[ch * 3];
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 02/13] avfilter/f_ebur128: simplify sample cache array
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
@ 2025-06-17 12:05 ` Niklas Haas
  2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 03/13] avfilter/f_ebur128: use structs for biquad weights Niklas Haas
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:05 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

We don't need an X sample cache anymore, and we also can simplify the
access macro slightly.
---
 libavfilter/f_ebur128.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 173a4f75ca..d0707e9ef9 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -115,7 +115,6 @@ typedef struct EBUR128Context {
 
     /* Filter caches.
      * The mult by 3 in the following is for X[i], X[i-1] and X[i-2] */
-    double *x;                      ///< 3 input samples cache for each channel
     double *y;                      ///< 3 pre-filter samples cache for each channel
     double *z;                      ///< 3 RLB-filter samples cache for each channel
     double pre_b[3];                ///< pre-filter numerator coefficients
@@ -446,11 +445,10 @@ static int config_audio_output(AVFilterLink *outlink)
                    AV_CH_SURROUND_DIRECT_LEFT               |AV_CH_SURROUND_DIRECT_RIGHT)
 
     ebur128->nb_channels  = nb_channels;
-    ebur128->x            = av_calloc(nb_channels, 3 * sizeof(*ebur128->x));
     ebur128->y            = av_calloc(nb_channels, 3 * sizeof(*ebur128->y));
     ebur128->z            = av_calloc(nb_channels, 3 * sizeof(*ebur128->z));
     ebur128->ch_weighting = av_calloc(nb_channels, sizeof(*ebur128->ch_weighting));
-    if (!ebur128->ch_weighting || !ebur128->x || !ebur128->y || !ebur128->z)
+    if (!ebur128->ch_weighting ||  !ebur128->y || !ebur128->z)
         return AVERROR(ENOMEM);
 
 #define I400_BINS(x)  ((x) * 4 / 10)
@@ -673,34 +671,30 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
         MOVE_TO_NEXT_CACHED_ENTRY(3000);
 
         for (ch = 0; ch < nb_channels; ch++) {
-            double bin;
-
             if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS)
                 ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(samples[idx_insample * nb_channels + ch]));
 
-            ebur128->x[ch * 3] = samples[idx_insample * nb_channels + ch]; // set X[i]
-
             if (!ebur128->ch_weighting[ch])
                 continue;
 
             /* Y[i] = X[i]*b0 + X[i-1]*b1 + X[i-2]*b2 - Y[i-1]*a1 - Y[i-2]*a2 */
-#define FILTER(Y, X, NUM, DEN) do {                                             \
-            double *dst = ebur128->Y + ch*3;                                    \
-            double src = ebur128->X[ch*3]  ;                                    \
-            double dst0 = NUM[0] * src + dst[1];                                \
-            dst[1] = NUM[1] * src + dst[2] - DEN[1] * dst0;                     \
-            dst[2] = NUM[2] * src - DEN[2] * dst0;                              \
-            dst[0] = dst0;                                                      \
+#define FILTER(DST, SRC, NUM, DEN) do {                                         \
+            const double tmp = DST[0] = NUM[0] * SRC + DST[1];                  \
+            DST[1] = NUM[1] * SRC + DST[2] - DEN[1] * tmp;                      \
+            DST[2] = NUM[2] * SRC - DEN[2] * tmp;                               \
 } while (0)
 
+            const double x = samples[idx_insample * nb_channels + ch];
+            double *restrict y = &ebur128->y[3 * ch];
+            double *restrict z = &ebur128->z[3 * ch];
+
             // TODO: merge both filters in one?
             FILTER(y, x, ebur128->pre_b, ebur128->pre_a);  // apply pre-filter
-            FILTER(z, y, ebur128->rlb_b, ebur128->rlb_a);  // apply RLB-filter
-
-            bin = ebur128->z[ch * 3] * ebur128->z[ch * 3];
+            FILTER(z, *y, ebur128->rlb_b, ebur128->rlb_a);  // apply RLB-filter
 
             /* add the new value, and limit the sum to the cache size (400ms or 3s)
              * by removing the oldest one */
+            double bin = *z * *z;
             ebur128->i400.sum [ch] = ebur128->i400.sum [ch] + bin - ebur128->i400.cache [ch][bin_id_400];
             ebur128->i3000.sum[ch] = ebur128->i3000.sum[ch] + bin - ebur128->i3000.cache[ch][bin_id_3000];
 
@@ -1073,7 +1067,6 @@ static av_cold void uninit(AVFilterContext *ctx)
     }
 
     av_freep(&ebur128->y_line_ref);
-    av_freep(&ebur128->x);
     av_freep(&ebur128->y);
     av_freep(&ebur128->z);
     av_freep(&ebur128->ch_weighting);
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 03/13] avfilter/f_ebur128: use structs for biquad weights
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
  2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 02/13] avfilter/f_ebur128: simplify sample cache array Niklas Haas
@ 2025-06-17 12:05 ` Niklas Haas
  2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 04/13] avfilter/f_ebur128: use a single packed array for the integrator cache Niklas Haas
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:05 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

Simplifies the code a bit. As a side benefit, copying the filter coefficients
to the stack before the sample loop is marginally faster.
---
 libavfilter/f_ebur128.c | 52 +++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index d0707e9ef9..776329db1c 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -75,6 +75,11 @@ struct integrator {
 
 struct rect { int x, y, w, h; };
 
+struct biquad {
+    double b0, b1, b2;
+    double a1, a2;
+};
+
 typedef struct EBUR128Context {
     const AVClass *class;           ///< AVClass context for log and options purpose
 
@@ -117,10 +122,8 @@ typedef struct EBUR128Context {
      * The mult by 3 in the following is for X[i], X[i-1] and X[i-2] */
     double *y;                      ///< 3 pre-filter samples cache for each channel
     double *z;                      ///< 3 RLB-filter samples cache for each channel
-    double pre_b[3];                ///< pre-filter numerator coefficients
-    double pre_a[3];                ///< pre-filter denominator coefficients
-    double rlb_b[3];                ///< rlb-filter numerator coefficients
-    double rlb_a[3];                ///< rlb-filter denominator coefficients
+    struct biquad pre;
+    struct biquad rlb;
 
     struct integrator i400;         ///< 400ms integrator, used for Momentary loudness  (M), and Integrated loudness (I)
     struct integrator i3000;        ///<    3s integrator, used for Short term loudness (S), and Loudness Range      (LRA)
@@ -405,21 +408,21 @@ static int config_audio_input(AVFilterLink *inlink)
 
     double a0 = 1.0 + K / Q + K * K;
 
-    ebur128->pre_b[0] = (Vh + Vb * K / Q + K * K) / a0;
-    ebur128->pre_b[1] = 2.0 * (K * K - Vh) / a0;
-    ebur128->pre_b[2] = (Vh - Vb * K / Q + K * K) / a0;
-    ebur128->pre_a[1] = 2.0 * (K * K - 1.0) / a0;
-    ebur128->pre_a[2] = (1.0 - K / Q + K * K) / a0;
+    ebur128->pre.b0 = (Vh + Vb * K / Q + K * K) / a0;
+    ebur128->pre.b1 = 2.0 * (K * K - Vh) / a0;
+    ebur128->pre.b2 = (Vh - Vb * K / Q + K * K) / a0;
+    ebur128->pre.a1 = 2.0 * (K * K - 1.0) / a0;
+    ebur128->pre.a2 = (1.0 - K / Q + K * K) / a0;
 
     f0 = 38.13547087602444;
     Q = 0.5003270373238773;
     K = tan(M_PI * f0 / (double)inlink->sample_rate);
 
-    ebur128->rlb_b[0] = 1.0;
-    ebur128->rlb_b[1] = -2.0;
-    ebur128->rlb_b[2] = 1.0;
-    ebur128->rlb_a[1] = 2.0 * (K * K - 1.0) / (1.0 + K / Q + K * K);
-    ebur128->rlb_a[2] = (1.0 - K / Q + K * K) / (1.0 + K / Q + K * K);
+    ebur128->rlb.b0 = 1.0;
+    ebur128->rlb.b1 = -2.0;
+    ebur128->rlb.b2 = 1.0;
+    ebur128->rlb.a1 = 2.0 * (K * K - 1.0) / (1.0 + K / Q + K * K);
+    ebur128->rlb.a2 = (1.0 - K / Q + K * K) / (1.0 + K / Q + K * K);
 
     /* Force 100ms framing in case of metadata injection: the frames must have
      * a granularity of the window overlap to be accurately exploited.
@@ -654,6 +657,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
     }
 #endif
 
+    const struct biquad pre = ebur128->pre;
+    const struct biquad rlb = ebur128->rlb;
+
     for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
         const int bin_id_400  = ebur128->i400.cache_pos;
         const int bin_id_3000 = ebur128->i3000.cache_pos;
@@ -678,10 +684,10 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
                 continue;
 
             /* Y[i] = X[i]*b0 + X[i-1]*b1 + X[i-2]*b2 - Y[i-1]*a1 - Y[i-2]*a2 */
-#define FILTER(DST, SRC, NUM, DEN) do {                                         \
-            const double tmp = DST[0] = NUM[0] * SRC + DST[1];                  \
-            DST[1] = NUM[1] * SRC + DST[2] - DEN[1] * tmp;                      \
-            DST[2] = NUM[2] * SRC - DEN[2] * tmp;                               \
+#define FILTER(DST, SRC, FILT) do {                                             \
+            const double tmp = DST[0] = FILT.b0 * SRC + DST[1];                 \
+            DST[1] = FILT.b1 * SRC + DST[2] - FILT.a1 * tmp;                    \
+            DST[2] = FILT.b2 * SRC - FILT.a2 * tmp;                             \
 } while (0)
 
             const double x = samples[idx_insample * nb_channels + ch];
@@ -689,14 +695,14 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
             double *restrict z = &ebur128->z[3 * ch];
 
             // TODO: merge both filters in one?
-            FILTER(y, x, ebur128->pre_b, ebur128->pre_a);  // apply pre-filter
-            FILTER(z, *y, ebur128->rlb_b, ebur128->rlb_a);  // apply RLB-filter
+            FILTER(y, x, pre);  // apply pre-filter
+            FILTER(z, *y, rlb); // apply RLB-filter
 
             /* add the new value, and limit the sum to the cache size (400ms or 3s)
              * by removing the oldest one */
-            double bin = *z * *z;
-            ebur128->i400.sum [ch] = ebur128->i400.sum [ch] + bin - ebur128->i400.cache [ch][bin_id_400];
-            ebur128->i3000.sum[ch] = ebur128->i3000.sum[ch] + bin - ebur128->i3000.cache[ch][bin_id_3000];
+            const double bin = *z * *z;
+            ebur128->i400.sum [ch] += bin - ebur128->i400.cache [ch][bin_id_400];
+            ebur128->i3000.sum[ch] += bin - ebur128->i3000.cache[ch][bin_id_3000];
 
             /* override old cache entry with the new value */
             ebur128->i400.cache [ch][bin_id_400 ] = bin;
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 04/13] avfilter/f_ebur128: use a single packed array for the integrator cache
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
  2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 02/13] avfilter/f_ebur128: simplify sample cache array Niklas Haas
  2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 03/13] avfilter/f_ebur128: use structs for biquad weights Niklas Haas
@ 2025-06-17 12:05 ` Niklas Haas
  2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 05/13] avfilter/f_ebur128: move weights and cache to EBUR128DSPContext Niklas Haas
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:05 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

Instead of having a separate planar array for each channel, use a single packed
(channel-interleaved) array. This will help with processing multiple channels in
parallel, as we can directly load all channels' data in a single load instruction.

Also improves memory locality of data, as the loop order is:

for (samples) {
    for (channels) {
        process sample
    }
}
---
 libavfilter/f_ebur128.c | 36 ++++++++++--------------------------
 1 file changed, 10 insertions(+), 26 deletions(-)

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 776329db1c..9f7c080750 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -62,7 +62,7 @@ struct hist_entry {
 };
 
 struct integrator {
-    double **cache;                 ///< window of filtered samples (N ms)
+    double *cache;                  ///< window of filtered samples (N ms)
     int cache_pos;                  ///< focus on the last added bin in the cache array
     int cache_size;
     double *sum;                    ///< sum of the last N ms filtered samples (cache content)
@@ -457,10 +457,12 @@ static int config_audio_output(AVFilterLink *outlink)
 #define I400_BINS(x)  ((x) * 4 / 10)
 #define I3000_BINS(x) ((x) * 3)
 
+    ebur128->i400.cache_size = I400_BINS(outlink->sample_rate);
+    ebur128->i3000.cache_size = I3000_BINS(outlink->sample_rate);
     ebur128->i400.sum = av_calloc(nb_channels, sizeof(*ebur128->i400.sum));
     ebur128->i3000.sum = av_calloc(nb_channels, sizeof(*ebur128->i3000.sum));
-    ebur128->i400.cache = av_calloc(nb_channels, sizeof(*ebur128->i400.cache));
-    ebur128->i3000.cache = av_calloc(nb_channels, sizeof(*ebur128->i3000.cache));
+    ebur128->i400.cache = av_calloc(nb_channels * ebur128->i400.cache_size, sizeof(*ebur128->i400.cache));
+    ebur128->i3000.cache = av_calloc(nb_channels * ebur128->i3000.cache_size, sizeof(*ebur128->i3000.cache));
     if (!ebur128->i400.sum || !ebur128->i3000.sum ||
         !ebur128->i400.cache || !ebur128->i3000.cache)
         return AVERROR(ENOMEM);
@@ -475,17 +477,6 @@ static int config_audio_output(AVFilterLink *outlink)
         } else {
             ebur128->ch_weighting[i] = 1.0;
         }
-
-        if (!ebur128->ch_weighting[i])
-            continue;
-
-        /* bins buffer for the two integration window (400ms and 3s) */
-        ebur128->i400.cache_size = I400_BINS(outlink->sample_rate);
-        ebur128->i3000.cache_size = I3000_BINS(outlink->sample_rate);
-        ebur128->i400.cache[i]  = av_calloc(ebur128->i400.cache_size,  sizeof(*ebur128->i400.cache[0]));
-        ebur128->i3000.cache[i] = av_calloc(ebur128->i3000.cache_size, sizeof(*ebur128->i3000.cache[0]));
-        if (!ebur128->i400.cache[i] || !ebur128->i3000.cache[i])
-            return AVERROR(ENOMEM);
     }
 
 #if CONFIG_SWRESAMPLE
@@ -663,6 +654,8 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
     for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
         const int bin_id_400  = ebur128->i400.cache_pos;
         const int bin_id_3000 = ebur128->i3000.cache_pos;
+        double *restrict cache_400  = &ebur128->i400.cache[bin_id_400 * nb_channels];
+        double *restrict cache_3000 = &ebur128->i3000.cache[bin_id_3000 * nb_channels];
 
 #define MOVE_TO_NEXT_CACHED_ENTRY(time) do {                \
     ebur128->i##time.cache_pos++;                           \
@@ -701,12 +694,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
             /* add the new value, and limit the sum to the cache size (400ms or 3s)
              * by removing the oldest one */
             const double bin = *z * *z;
-            ebur128->i400.sum [ch] += bin - ebur128->i400.cache [ch][bin_id_400];
-            ebur128->i3000.sum[ch] += bin - ebur128->i3000.cache[ch][bin_id_3000];
-
-            /* override old cache entry with the new value */
-            ebur128->i400.cache [ch][bin_id_400 ] = bin;
-            ebur128->i3000.cache[ch][bin_id_3000] = bin;
+            ebur128->i400.sum [ch] += bin - cache_400[ch];
+            ebur128->i3000.sum[ch] += bin - cache_3000[ch];
+            cache_400[ch] = cache_3000[ch] = bin;
         }
 
 #define FIND_PEAK(global, sp, ptype) do {                        \
@@ -1083,12 +1073,6 @@ static av_cold void uninit(AVFilterContext *ctx)
     av_freep(&ebur128->i3000.sum);
     av_freep(&ebur128->i400.histogram);
     av_freep(&ebur128->i3000.histogram);
-    for (int i = 0; i < ebur128->nb_channels; i++) {
-        if (ebur128->i400.cache)
-            av_freep(&ebur128->i400.cache[i]);
-        if (ebur128->i3000.cache)
-            av_freep(&ebur128->i3000.cache[i]);
-    }
     av_freep(&ebur128->i400.cache);
     av_freep(&ebur128->i3000.cache);
     av_frame_free(&ebur128->outpicref);
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 05/13] avfilter/f_ebur128: move weights and cache to EBUR128DSPContext
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
                   ` (2 preceding siblings ...)
  2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 04/13] avfilter/f_ebur128: use a single packed array for the integrator cache Niklas Haas
@ 2025-06-17 12:05 ` Niklas Haas
  2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 06/13] avfilter/f_ebur128: split off C implementation to separate function Niklas Haas
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:05 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

---
 libavfilter/f_ebur128.c | 53 +++++++++++++++++------------------------
 libavfilter/f_ebur128.h | 40 +++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 31 deletions(-)
 create mode 100644 libavfilter/f_ebur128.h

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 9f7c080750..c3328dc520 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -43,6 +43,8 @@
 #include "formats.h"
 #include "video.h"
 
+#include "f_ebur128.h"
+
 #define ABS_THRES    -70            ///< silence gate: we discard anything below this absolute (LUFS) threshold
 #define ABS_UP_THRES  10            ///< upper loud limit to consider (ABS_THRES being the minimum)
 #define HIST_GRAIN   100            ///< defines histogram precision
@@ -75,13 +77,9 @@ struct integrator {
 
 struct rect { int x, y, w, h; };
 
-struct biquad {
-    double b0, b1, b2;
-    double a1, a2;
-};
-
 typedef struct EBUR128Context {
     const AVClass *class;           ///< AVClass context for log and options purpose
+    EBUR128DSPContext dsp;
 
     /* peak metering */
     int peak_mode;                  ///< enabled peak modes
@@ -118,13 +116,6 @@ typedef struct EBUR128Context {
     int idx_insample;               ///< current sample position of processed samples in single input frame
     AVFrame *insamples;             ///< input samples reference, updated regularly
 
-    /* Filter caches.
-     * The mult by 3 in the following is for X[i], X[i-1] and X[i-2] */
-    double *y;                      ///< 3 pre-filter samples cache for each channel
-    double *z;                      ///< 3 RLB-filter samples cache for each channel
-    struct biquad pre;
-    struct biquad rlb;
-
     struct integrator i400;         ///< 400ms integrator, used for Momentary loudness  (M), and Integrated loudness (I)
     struct integrator i3000;        ///<    3s integrator, used for Short term loudness (S), and Loudness Range      (LRA)
 
@@ -408,21 +399,21 @@ static int config_audio_input(AVFilterLink *inlink)
 
     double a0 = 1.0 + K / Q + K * K;
 
-    ebur128->pre.b0 = (Vh + Vb * K / Q + K * K) / a0;
-    ebur128->pre.b1 = 2.0 * (K * K - Vh) / a0;
-    ebur128->pre.b2 = (Vh - Vb * K / Q + K * K) / a0;
-    ebur128->pre.a1 = 2.0 * (K * K - 1.0) / a0;
-    ebur128->pre.a2 = (1.0 - K / Q + K * K) / a0;
+    ebur128->dsp.pre.b0 = (Vh + Vb * K / Q + K * K) / a0;
+    ebur128->dsp.pre.b1 = 2.0 * (K * K - Vh) / a0;
+    ebur128->dsp.pre.b2 = (Vh - Vb * K / Q + K * K) / a0;
+    ebur128->dsp.pre.a1 = 2.0 * (K * K - 1.0) / a0;
+    ebur128->dsp.pre.a2 = (1.0 - K / Q + K * K) / a0;
 
     f0 = 38.13547087602444;
     Q = 0.5003270373238773;
     K = tan(M_PI * f0 / (double)inlink->sample_rate);
 
-    ebur128->rlb.b0 = 1.0;
-    ebur128->rlb.b1 = -2.0;
-    ebur128->rlb.b2 = 1.0;
-    ebur128->rlb.a1 = 2.0 * (K * K - 1.0) / (1.0 + K / Q + K * K);
-    ebur128->rlb.a2 = (1.0 - K / Q + K * K) / (1.0 + K / Q + K * K);
+    ebur128->dsp.rlb.b0 = 1.0;
+    ebur128->dsp.rlb.b1 = -2.0;
+    ebur128->dsp.rlb.b2 = 1.0;
+    ebur128->dsp.rlb.a1 = 2.0 * (K * K - 1.0) / (1.0 + K / Q + K * K);
+    ebur128->dsp.rlb.a2 = (1.0 - K / Q + K * K) / (1.0 + K / Q + K * K);
 
     /* Force 100ms framing in case of metadata injection: the frames must have
      * a granularity of the window overlap to be accurately exploited.
@@ -448,10 +439,10 @@ static int config_audio_output(AVFilterLink *outlink)
                    AV_CH_SURROUND_DIRECT_LEFT               |AV_CH_SURROUND_DIRECT_RIGHT)
 
     ebur128->nb_channels  = nb_channels;
-    ebur128->y            = av_calloc(nb_channels, 3 * sizeof(*ebur128->y));
-    ebur128->z            = av_calloc(nb_channels, 3 * sizeof(*ebur128->z));
+    ebur128->dsp.y        = av_calloc(nb_channels, 3 * sizeof(*ebur128->dsp.y));
+    ebur128->dsp.z        = av_calloc(nb_channels, 3 * sizeof(*ebur128->dsp.z));
     ebur128->ch_weighting = av_calloc(nb_channels, sizeof(*ebur128->ch_weighting));
-    if (!ebur128->ch_weighting ||  !ebur128->y || !ebur128->z)
+    if (!ebur128->ch_weighting || !ebur128->dsp.y || !ebur128->dsp.z)
         return AVERROR(ENOMEM);
 
 #define I400_BINS(x)  ((x) * 4 / 10)
@@ -648,8 +639,8 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
     }
 #endif
 
-    const struct biquad pre = ebur128->pre;
-    const struct biquad rlb = ebur128->rlb;
+    const EBUR128Biquad pre = ebur128->dsp.pre;
+    const EBUR128Biquad rlb = ebur128->dsp.rlb;
 
     for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
         const int bin_id_400  = ebur128->i400.cache_pos;
@@ -684,8 +675,8 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
 } while (0)
 
             const double x = samples[idx_insample * nb_channels + ch];
-            double *restrict y = &ebur128->y[3 * ch];
-            double *restrict z = &ebur128->z[3 * ch];
+            double *restrict y = &ebur128->dsp.y[3 * ch];
+            double *restrict z = &ebur128->dsp.z[3 * ch];
 
             // TODO: merge both filters in one?
             FILTER(y, x, pre);  // apply pre-filter
@@ -1063,8 +1054,8 @@ static av_cold void uninit(AVFilterContext *ctx)
     }
 
     av_freep(&ebur128->y_line_ref);
-    av_freep(&ebur128->y);
-    av_freep(&ebur128->z);
+    av_freep(&ebur128->dsp.y);
+    av_freep(&ebur128->dsp.z);
     av_freep(&ebur128->ch_weighting);
     av_freep(&ebur128->true_peaks);
     av_freep(&ebur128->sample_peaks);
diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
new file mode 100644
index 0000000000..42cce9a5e4
--- /dev/null
+++ b/libavfilter/f_ebur128.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ * Copyright (c) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_F_EBUR128_H
+#define AVFILTER_F_EBUR128_H
+
+typedef struct EBUR128Biquad {
+    double b0, b1, b2;
+    double a1, a2;
+} EBUR128Biquad;
+
+typedef struct EBUR128DSPContext {
+    /* Filter data */
+    EBUR128Biquad pre;
+    EBUR128Biquad rlb;
+
+    /* Per-channel filter state, 3 doubles each: [0] latest output, [1] and [2] delay state */
+    double *y; /* after pre-filter */
+    double *z; /* after RLB-filter */
+} EBUR128DSPContext;
+
+#endif /* AVFILTER_F_EBUR128_H */
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 06/13] avfilter/f_ebur128: split off C implementation to separate function
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
                   ` (3 preceding siblings ...)
  2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 05/13] avfilter/f_ebur128: move weights and cache to EBUR128DSPContext Niklas Haas
@ 2025-06-17 12:05 ` Niklas Haas
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 07/13] avfilter/x86/f_ebur128: add x86 AVX implementation Niklas Haas
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:05 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

The peak measurement loop is split out separately, both to avoid bloating the
function signature and because it is only conditionally used.
---
 libavfilter/f_ebur128.c | 83 ++++++++++++++++++++++++-----------------
 libavfilter/f_ebur128.h |  3 ++
 2 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index c3328dc520..b9e210c05a 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -609,11 +609,48 @@ static int gate_update(struct integrator *integ, double power,
     return gate_hist_pos;
 }
 
+void ff_ebur128_filter_channels_c(const EBUR128DSPContext *dsp,
+                                  const double *restrict samples,
+                                  double *restrict cache_400,
+                                  double *restrict cache_3000,
+                                  double *restrict sum_400,
+                                  double *restrict sum_3000,
+                                  const int nb_channels)
+{
+    const EBUR128Biquad pre = dsp->pre;
+    const EBUR128Biquad rlb = dsp->rlb;
+
+    for (int ch = 0; ch < nb_channels; ch++) {
+        /* Y[i] = X[i]*b0 + X[i-1]*b1 + X[i-2]*b2 - Y[i-1]*a1 - Y[i-2]*a2 */
+#define FILTER(DST, SRC, FILT) do {                                         \
+        const double tmp = DST[0] = FILT.b0 * SRC + DST[1];                 \
+        DST[1] = FILT.b1 * SRC + DST[2] - FILT.a1 * tmp;                    \
+        DST[2] = FILT.b2 * SRC - FILT.a2 * tmp;                             \
+} while (0)
+
+        const double x = samples[ch];
+        double *restrict y = &dsp->y[3 * ch];
+        double *restrict z = &dsp->z[3 * ch];
+
+        // TODO: merge both filters in one?
+        FILTER(y, x, pre);  // apply pre-filter
+        FILTER(z, *y, rlb); // apply RLB-filter
+
+        /* add the new value, and limit the sum to the cache size (400ms or 3s)
+         * by removing the oldest one */
+        const double bin = *z * *z;
+        sum_400 [ch] += bin - cache_400[ch];
+        sum_3000[ch] += bin - cache_3000[ch];
+        cache_400[ch] = cache_3000[ch] = bin;
+    }
+}
+
 static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
 {
     int i, ch, idx_insample, ret;
     AVFilterContext *ctx = inlink->dst;
     EBUR128Context *ebur128 = ctx->priv;
+    const EBUR128DSPContext *dsp = &ebur128->dsp;
     const int nb_channels = ebur128->nb_channels;
     const int nb_samples  = insamples->nb_samples;
     const double *samples = (double *)insamples->data[0];
@@ -639,14 +676,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
     }
 #endif
 
-    const EBUR128Biquad pre = ebur128->dsp.pre;
-    const EBUR128Biquad rlb = ebur128->dsp.rlb;
-
     for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
         const int bin_id_400  = ebur128->i400.cache_pos;
         const int bin_id_3000 = ebur128->i3000.cache_pos;
-        double *restrict cache_400  = &ebur128->i400.cache[bin_id_400 * nb_channels];
-        double *restrict cache_3000 = &ebur128->i3000.cache[bin_id_3000 * nb_channels];
 
 #define MOVE_TO_NEXT_CACHED_ENTRY(time) do {                \
     ebur128->i##time.cache_pos++;                           \
@@ -660,35 +692,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
         MOVE_TO_NEXT_CACHED_ENTRY(400);
         MOVE_TO_NEXT_CACHED_ENTRY(3000);
 
-        for (ch = 0; ch < nb_channels; ch++) {
-            if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS)
-                ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(samples[idx_insample * nb_channels + ch]));
-
-            if (!ebur128->ch_weighting[ch])
-                continue;
-
-            /* Y[i] = X[i]*b0 + X[i-1]*b1 + X[i-2]*b2 - Y[i-1]*a1 - Y[i-2]*a2 */
-#define FILTER(DST, SRC, FILT) do {                                             \
-            const double tmp = DST[0] = FILT.b0 * SRC + DST[1];                 \
-            DST[1] = FILT.b1 * SRC + DST[2] - FILT.a1 * tmp;                    \
-            DST[2] = FILT.b2 * SRC - FILT.a2 * tmp;                             \
-} while (0)
-
-            const double x = samples[idx_insample * nb_channels + ch];
-            double *restrict y = &ebur128->dsp.y[3 * ch];
-            double *restrict z = &ebur128->dsp.z[3 * ch];
-
-            // TODO: merge both filters in one?
-            FILTER(y, x, pre);  // apply pre-filter
-            FILTER(z, *y, rlb); // apply RLB-filter
-
-            /* add the new value, and limit the sum to the cache size (400ms or 3s)
-             * by removing the oldest one */
-            const double bin = *z * *z;
-            ebur128->i400.sum [ch] += bin - cache_400[ch];
-            ebur128->i3000.sum[ch] += bin - cache_3000[ch];
-            cache_400[ch] = cache_3000[ch] = bin;
-        }
+        ff_ebur128_filter_channels_c(dsp, &samples[idx_insample * nb_channels],
+                                     &ebur128->i400.cache[bin_id_400 * nb_channels],
+                                     &ebur128->i3000.cache[bin_id_3000 * nb_channels],
+                                     ebur128->i400.sum, ebur128->i3000.sum,
+                                     nb_channels);
 
 #define FIND_PEAK(global, sp, ptype) do {                        \
     int ch;                                                      \
@@ -701,6 +709,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
     }                                                            \
 } while (0)
 
+        if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS) {
+            for (ch = 0; ch < nb_channels; ch++) {
+                const double sample = samples[idx_insample * nb_channels + ch];
+                ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(sample));
+            }
+        }
+
         FIND_PEAK(ebur128->sample_peak, ebur128->sample_peaks, SAMPLES);
         FIND_PEAK(ebur128->true_peak,   ebur128->true_peaks,   TRUE);
 
diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
index 42cce9a5e4..7b8e876576 100644
--- a/libavfilter/f_ebur128.h
+++ b/libavfilter/f_ebur128.h
@@ -37,4 +37,7 @@ typedef struct EBUR128DSPContext {
     double *z; /* after RLB-filter */
 } EBUR128DSPContext;
 
+void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
+                                  double *, double *, double *, double *, int);
+
 #endif /* AVFILTER_F_EBUR128_H */
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 07/13] avfilter/x86/f_ebur128: add x86 AVX implementation
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
                   ` (4 preceding siblings ...)
  2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 06/13] avfilter/f_ebur128: split off C implementation to separate function Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 08/13] avfilter/f_ebur128: remove pointless macro Niklas Haas
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

Processes two channels in parallel, using 128-bit XMM registers.

In theory, we could go up to YMM registers to process 4 channels, but this is
not a gain except for relatively high channel counts (e.g. 7.1), and also
complicates the sample load/store operations considerably.

I decided to only add an AVX variant, since the C code is not slow enough by
comparison to justify a separate function just for ancient CPUs.
---
 libavfilter/f_ebur128.c          |  15 ++--
 libavfilter/f_ebur128.h          |  16 ++++
 libavfilter/x86/Makefile         |   4 +
 libavfilter/x86/f_ebur128.asm    | 141 +++++++++++++++++++++++++++++++
 libavfilter/x86/f_ebur128_init.c |  35 ++++++++
 5 files changed, 206 insertions(+), 5 deletions(-)
 create mode 100644 libavfilter/x86/f_ebur128.asm
 create mode 100644 libavfilter/x86/f_ebur128_init.c

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index b9e210c05a..2d94cefce7 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -579,6 +579,11 @@ static av_cold int init(AVFilterContext *ctx)
     /* summary */
     av_log(ctx, AV_LOG_VERBOSE, "EBU +%d scale\n", ebur128->meter);
 
+    ebur128->dsp.filter_channels = ff_ebur128_filter_channels_c;
+#if ARCH_X86
+    ff_ebur128_init_x86(&ebur128->dsp);
+#endif
+
     return 0;
 }
 
@@ -692,11 +697,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
         MOVE_TO_NEXT_CACHED_ENTRY(400);
         MOVE_TO_NEXT_CACHED_ENTRY(3000);
 
-        ff_ebur128_filter_channels_c(dsp, &samples[idx_insample * nb_channels],
-                                     &ebur128->i400.cache[bin_id_400 * nb_channels],
-                                     &ebur128->i3000.cache[bin_id_3000 * nb_channels],
-                                     ebur128->i400.sum, ebur128->i3000.sum,
-                                     nb_channels);
+        dsp->filter_channels(dsp, &samples[idx_insample * nb_channels],
+                             &ebur128->i400.cache[bin_id_400 * nb_channels],
+                             &ebur128->i3000.cache[bin_id_3000 * nb_channels],
+                             ebur128->i400.sum, ebur128->i3000.sum,
+                             nb_channels);
 
 #define FIND_PEAK(global, sp, ptype) do {                        \
     int ch;                                                      \
diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
index 7b8e876576..1889e28bdd 100644
--- a/libavfilter/f_ebur128.h
+++ b/libavfilter/f_ebur128.h
@@ -22,6 +22,9 @@
 #ifndef AVFILTER_F_EBUR128_H
 #define AVFILTER_F_EBUR128_H
 
+#include <assert.h>
+#include <stddef.h>
+
 typedef struct EBUR128Biquad {
     double b0, b1, b2;
     double a1, a2;
@@ -35,8 +38,21 @@ typedef struct EBUR128DSPContext {
     /* Cache of 3 samples for each channel */
     double *y; /* after pre-filter */
     double *z; /* after RLB-filter */
+
+    /* DSP functions */
+    void (*filter_channels)(const struct EBUR128DSPContext *dsp,
+                            const double *samples,
+                            double *cache_400, double *cache_3000,
+                            double *sum_400, double *sum_3000,
+                            int nb_channels);
 } EBUR128DSPContext;
 
+static_assert(offsetof(EBUR128DSPContext, pre) == 0,                   "struct layout mismatch");
+static_assert(offsetof(EBUR128DSPContext, rlb) == 5  * sizeof(double), "struct layout mismatch");
+static_assert(offsetof(EBUR128DSPContext, y)   == 10 * sizeof(double), "struct layout mismatch");
+
+void ff_ebur128_init_x86(EBUR128DSPContext *dsp);
+
 void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
                                   double *, double *, double *, double *, int);
 
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 0d9a28a935..e5f0c55a5e 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -7,6 +7,7 @@ OBJS-$(CONFIG_BLEND_FILTER)                  += x86/vf_blend_init.o
 OBJS-$(CONFIG_BWDIF_FILTER)                  += x86/vf_bwdif_init.o
 OBJS-$(CONFIG_COLORSPACE_FILTER)             += x86/colorspacedsp_init.o
 OBJS-$(CONFIG_CONVOLUTION_FILTER)            += x86/vf_convolution_init.o
+OBJS-$(CONFIG_EBUR128_FILTER)                += x86/f_ebur128_init.o
 OBJS-$(CONFIG_EQ_FILTER)                     += x86/vf_eq_init.o
 OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
 OBJS-$(CONFIG_GBLUR_FILTER)                  += x86/vf_gblur_init.o
@@ -52,6 +53,9 @@ X86ASM-OBJS-$(CONFIG_BLEND_FILTER)           += x86/vf_blend.o
 X86ASM-OBJS-$(CONFIG_BWDIF_FILTER)           += x86/vf_bwdif.o
 X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER)      += x86/colorspacedsp.o
 X86ASM-OBJS-$(CONFIG_CONVOLUTION_FILTER)     += x86/vf_convolution.o
+ifdef ARCH_X86_64
+X86ASM-OBJS-$(CONFIG_EBUR128_FILTER)         += x86/f_ebur128.o
+endif
 X86ASM-OBJS-$(CONFIG_EQ_FILTER)              += x86/vf_eq.o
 X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER)       += x86/vf_framerate.o
 X86ASM-OBJS-$(CONFIG_FSPP_FILTER)            += x86/vf_fspp.o
diff --git a/libavfilter/x86/f_ebur128.asm b/libavfilter/x86/f_ebur128.asm
new file mode 100644
index 0000000000..d9cc8d9361
--- /dev/null
+++ b/libavfilter/x86/f_ebur128.asm
@@ -0,0 +1,141 @@
+;*****************************************************************************
+;* x86-optimized functions for ebur128 filter
+;*
+;* Copyright (C) 2025 Niklas Haas
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern ebur128_filter_channels_c
+
+struc Biquad
+    .b0 resq 1
+    .b1 resq 1
+    .b2 resq 1
+    .a1 resq 1
+    .a2 resq 1
+endstruc
+
+struc DSP
+    .pre resq 5
+    .rlb resq 5
+    .y resq 1
+    .z resq 1
+endstruc
+
+SECTION .text
+
+%macro MOVNQ 3 ; num, dst, src -- move num (1 or more) doubles between dst and src
+%if %1 == 1
+    movsd %2, %3 ; single double (low lane only)
+%else
+    movupd %2, %3 ; full packed register, unaligned
+%endif
+%endmacro
+
+%macro FILTER 11 ; y0, y1, y2, x, b0, b1, b2, a1, a2, state pointer (3 doubles per channel), num_channels
+    ; Y[0] := b0 * X + Y1
+    ; Y[1] := b1 * X + Y2 - a1 * Y[0]
+    ; Y[2] := b2 * X - a2 * Y[0]
+    movsd %1, [%10 +  8] ; low lane: state[1] of the first channel
+    movsd %3, [%10 + 16] ; low lane: state[2] of the first channel
+%if %11 > 1
+    movhpd %1, [%10 + 32] ; high lane: state[1] of the second channel (24 bytes further)
+    movhpd %3, [%10 + 40] ; high lane: state[2] of the second channel
+%endif
+
+    mulpd %2, %5, %4
+    addpd %1, %2 ; y0 = b0 * x + old state[1]
+
+    mulpd %2, %8, %1
+    subpd %3, %2
+    mulpd %2, %6, %4
+    addpd %2, %3 ; y1 = b1 * x + old state[2] - a1 * y0
+
+    mulpd %3, %7, %4
+    mulpd %4, %9, %1 ; note: the x register is clobbered here
+    subpd %3, %4 ; y2 = b2 * x - a2 * y0
+
+    movsd [%10 +  0], %1 ; write back the first channel's three doubles
+    movsd [%10 +  8], %2
+    movsd [%10 + 16], %3
+%if %11 > 1
+    movhpd [%10 + 24], %1 ; write back the second channel's three doubles
+    movhpd [%10 + 32], %2
+    movhpd [%10 + 40], %3
+%endif
+    add %10, 24 * %11 ; advance the state pointer past the processed channels
+%endmacro
+
+%macro filter_channels 1 ; num_channels (1 or 2) processed per expansion
+    MOVNQ %1, m3, [samplesq] ; m3 = input sample(s)
+    add samplesq, 8 * %1
+
+    FILTER m0, m1, m2, m3, m4,  m5,  m6,  m7,  m8, r7q, %1 ; pre-filter: input m3, output m0
+    FILTER m3, m1, m2, m0, m9, m10, m11, m12, m13, r8q, %1 ; RLB-filter: input m0, output m3
+
+    ; update sum and cache
+    mulpd m3, m3 ; m3 = squared RLB output (the new bin)
+    subpd m0, m3, [cache400q] ; NOTE(review): packed memory operand loads two doubles even when num_channels is 1 -- confirm the buffers tolerate this
+    subpd m1, m3, [cache3000q]
+    MOVNQ %1, [cache400q],  m3 ; store the new bin over the old cache entry
+    MOVNQ %1, [cache3000q], m3
+    add cache400q,  8 * %1
+    add cache3000q, 8 * %1
+    addpd m0, [sum400q] ; sum += new bin - old bin
+    addpd m1, [sum3000q]
+    MOVNQ %1, [sum400q],  m0
+    MOVNQ %1, [sum3000q], m1
+    add sum400q,  8 * %1
+    add sum3000q, 8 * %1
+%endmacro
+
+INIT_XMM avx
+cglobal ebur128_filter_channels, 7, 9, 14, dsp, samples, cache400, cache3000, sum400, sum3000, channels
+    movddup m4,  [dspq + DSP.pre + Biquad.b0] ; m4-m8: pre-filter coefficients, duplicated into both lanes
+    movddup m5,  [dspq + DSP.pre + Biquad.b1]
+    movddup m6,  [dspq + DSP.pre + Biquad.b2]
+    movddup m7,  [dspq + DSP.pre + Biquad.a1]
+    movddup m8,  [dspq + DSP.pre + Biquad.a2]
+
+    movddup m9,  [dspq + DSP.rlb + Biquad.b0] ; m9-m13: RLB-filter coefficients, duplicated into both lanes
+    movddup m10, [dspq + DSP.rlb + Biquad.b1]
+    movddup m11, [dspq + DSP.rlb + Biquad.b2]
+    movddup m12, [dspq + DSP.rlb + Biquad.a1]
+    movddup m13, [dspq + DSP.rlb + Biquad.a2]
+
+    mov r7q, [dspq + DSP.y] ; r7 = pointer stored in the y field of the DSP struct
+    mov r8q, [dspq + DSP.z] ; r8 = pointer stored in the z field of the DSP struct
+
+    ; handle odd channel count
+    test channelsd, 1
+    jnz .tail ; odd: process one channel alone first, then continue in pairs
+
+.loop:
+    filter_channels 2 ; two channels per iteration
+    sub channelsd, 2
+    jg .loop
+    RET
+
+.tail:
+    filter_channels 1 ; single leading channel
+    dec channelsd
+    test channelsd, channelsd
+    jnz .loop ; remaining (even) channels, if any
+    RET
diff --git a/libavfilter/x86/f_ebur128_init.c b/libavfilter/x86/f_ebur128_init.c
new file mode 100644
index 0000000000..8f38aee967
--- /dev/null
+++ b/libavfilter/x86/f_ebur128_init.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/f_ebur128.h"
+
+void ff_ebur128_filter_channels_avx(const EBUR128DSPContext *, const double *,
+                                    double *, double *, double *, double *, int);
+
+av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp) /* may replace dsp->filter_channels with the AVX routine */
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    /* only on x86-64 builds, and only when EXTERNAL_AVX() accepts the detected CPU flags; otherwise dsp is left untouched */
+    if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags))
+        dsp->filter_channels = ff_ebur128_filter_channels_avx;
+}
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 08/13] avfilter/f_ebur128: remove pointless macro
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
                   ` (5 preceding siblings ...)
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 07/13] avfilter/x86/f_ebur128: add x86 AVX implementation Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 09/13] avfilter/f_ebur128: move true peak calculation out of main loop Niklas Haas
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

This macro is neither shortening the code nor aiding readability.
---
 libavfilter/f_ebur128.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 2d94cefce7..2e1eedd855 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -682,20 +682,18 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
 #endif
 
     for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
-        const int bin_id_400  = ebur128->i400.cache_pos;
-        const int bin_id_3000 = ebur128->i3000.cache_pos;
-
-#define MOVE_TO_NEXT_CACHED_ENTRY(time) do {                \
-    ebur128->i##time.cache_pos++;                           \
-    if (ebur128->i##time.cache_pos ==                       \
-        ebur128->i##time.cache_size) {                      \
-        ebur128->i##time.filled    = 1;                     \
-        ebur128->i##time.cache_pos = 0;                     \
-    }                                                       \
-} while (0)
+        const int bin_id_400  = ebur128->i400.cache_pos++;
+        const int bin_id_3000 = ebur128->i3000.cache_pos++;
+
+        if (ebur128->i400.cache_pos == ebur128->i400.cache_size) {
+            ebur128->i400.filled    = 1;
+            ebur128->i400.cache_pos = 0;
+        }
 
-        MOVE_TO_NEXT_CACHED_ENTRY(400);
-        MOVE_TO_NEXT_CACHED_ENTRY(3000);
+        if (ebur128->i3000.cache_pos == ebur128->i3000.cache_size) {
+            ebur128->i3000.filled    = 1;
+            ebur128->i3000.cache_pos = 0;
+        }
 
         dsp->filter_channels(dsp, &samples[idx_insample * nb_channels],
                              &ebur128->i400.cache[bin_id_400 * nb_channels],
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 09/13] avfilter/f_ebur128: move true peak calculation out of main loop
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
                   ` (6 preceding siblings ...)
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 08/13] avfilter/f_ebur128: remove pointless macro Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 10/13] avfilter/f_ebur128: lift sample " Niklas Haas
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

Easier to read, less convoluted, and ~30% faster. Most importantly, this
avoids repeating the redundant recalculation of the true peak on every
single sample, by moving the FIND_PEAK() loop out of the main loop. (Note
that FIND_PEAK() does not depend on the current sample index at all, so
there is no reason for it to ever be recomputed here)
---
 libavfilter/f_ebur128.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 2e1eedd855..23092b597f 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -668,16 +668,22 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
                               (const uint8_t **)insamples->data, nb_samples);
         if (ret < 0)
             return ret;
-        for (ch = 0; ch < nb_channels; ch++)
-            ebur128->true_peaks_per_frame[ch] = 0.0;
-        for (idx_insample = 0; idx_insample < ret; idx_insample++) {
-            for (ch = 0; ch < nb_channels; ch++) {
-                ebur128->true_peaks[ch] = FFMAX(ebur128->true_peaks[ch], fabs(*swr_samples));
-                ebur128->true_peaks_per_frame[ch] = FFMAX(ebur128->true_peaks_per_frame[ch],
-                                                          fabs(*swr_samples));
-                swr_samples++;
+
+        double maxpeak = 0.0; /* largest true peak over all channels */
+        for (int ch = 0; ch < nb_channels; ch++) {
+            double tp   = ebur128->true_peaks[ch]; /* running peak, carried across frames */
+            double tppf = 0.0; /* peak of this frame only */
+            for (int i = 0; i < ret; i++) {
+                const double sample = fabs(swr_samples[i * nb_channels + ch]); /* interleaved buffer: "+ ch" selects this channel, not always channel 0 */
+                tp   = FFMAX(tp,   sample);
+                tppf = FFMAX(tppf, sample);
+            }
+            maxpeak = FFMAX(maxpeak, tp);
+            ebur128->true_peaks[ch] = tp;
+            ebur128->true_peaks_per_frame[ch] = tppf;
+        }
+
+        ebur128->true_peak = DBFS(maxpeak);
     }
 #endif
 
@@ -720,7 +726,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
         }
 
         FIND_PEAK(ebur128->sample_peak, ebur128->sample_peaks, SAMPLES);
-        FIND_PEAK(ebur128->true_peak,   ebur128->true_peaks,   TRUE);
 
         /* For integrated loudness, gating blocks are 400ms long with 75%
          * overlap (see BS.1770-2 p5), so a re-computation is needed each 100ms
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 10/13] avfilter/f_ebur128: lift sample peak calculation out of main loop
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
                   ` (7 preceding siblings ...)
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 09/13] avfilter/f_ebur128: move true peak calculation out of main loop Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 11/13] avfilter/f_ebur128: move variable declarations to usage site Niklas Haas
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

This is substantially faster (~55%) than the transposed loop, and also
avoids an unnecessary macro.
---
 libavfilter/f_ebur128.c | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 23092b597f..4051b1ea95 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -687,6 +687,24 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
     }
 #endif
 
+    if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS) {
+        double maxpeak = 0.0;
+        for (int ch = 0; ch < nb_channels; ch++) {
+            const double *restrict samples_ch = &samples[ch];
+            double sp = ebur128->sample_peaks[ch];
+
+            for (int i = ebur128->idx_insample; i < nb_samples; i++) {
+                const double sample = fabs(samples_ch[nb_channels * i]);
+                sp = FFMAX(sp, sample);
+            }
+            maxpeak = FFMAX(maxpeak, sp);
+            ebur128->sample_peaks[ch] = sp;
+        }
+
+        ebur128->sample_peak = DBFS(maxpeak);
+    }
+
+
     for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
         const int bin_id_400  = ebur128->i400.cache_pos++;
         const int bin_id_3000 = ebur128->i3000.cache_pos++;
@@ -707,26 +725,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
                              ebur128->i400.sum, ebur128->i3000.sum,
                              nb_channels);
 
-#define FIND_PEAK(global, sp, ptype) do {                        \
-    int ch;                                                      \
-    double maxpeak;                                              \
-    maxpeak = 0.0;                                               \
-    if (ebur128->peak_mode & PEAK_MODE_ ## ptype ## _PEAKS) {    \
-        for (ch = 0; ch < ebur128->nb_channels; ch++)            \
-            maxpeak = FFMAX(maxpeak, sp[ch]);                    \
-        global = DBFS(maxpeak);                                  \
-    }                                                            \
-} while (0)
-
-        if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS) {
-            for (ch = 0; ch < nb_channels; ch++) {
-                const double sample = samples[idx_insample * nb_channels + ch];
-                ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(sample));
-            }
-        }
-
-        FIND_PEAK(ebur128->sample_peak, ebur128->sample_peaks, SAMPLES);
-
         /* For integrated loudness, gating blocks are 400ms long with 75%
          * overlap (see BS.1770-2 p5), so a re-computation is needed each 100ms
          * (4800 samples at 48kHz). */
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 11/13] avfilter/f_ebur128: move variable declarations to usage site
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
                   ` (8 preceding siblings ...)
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 10/13] avfilter/f_ebur128: lift sample " Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 12/13] avfilter/f_ebur128: move true peak calculation to DSP function Niklas Haas
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 13/13] avfilter/x86/f_ebur128: implement AVX true peak calculation Niklas Haas
  11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

This is actually allowed by non-ancient versions of C.
---
 libavfilter/f_ebur128.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 4051b1ea95..1fb7129271 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -652,7 +652,7 @@ void ff_ebur128_filter_channels_c(const EBUR128DSPContext *dsp,
 
 static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
 {
-    int i, ch, idx_insample, ret;
+    int ret;
     AVFilterContext *ctx = inlink->dst;
     EBUR128Context *ebur128 = ctx->priv;
     const EBUR128DSPContext *dsp = &ebur128->dsp;
@@ -705,7 +705,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
     }
 
 
-    for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
+    for (int idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
         const int bin_id_400  = ebur128->i400.cache_pos++;
         const int bin_id_3000 = ebur128->i3000.cache_pos++;
 
@@ -741,7 +741,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
 #define COMPUTE_LOUDNESS(m, time) do {                                              \
     if (ebur128->i##time.filled) {                                                  \
         /* weighting sum of the last <time> ms */                                   \
-        for (ch = 0; ch < nb_channels; ch++)                                        \
+        for (int ch = 0; ch < nb_channels; ch++)                                    \
             power_##time += ebur128->ch_weighting[ch] * ebur128->i##time.sum[ch];   \
         power_##time /= I##time##_BINS(inlink->sample_rate);                        \
     }                                                                               \
@@ -762,7 +762,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
 
                 /* compute integrated loudness by summing the histogram values
                  * above the relative threshold */
-                for (i = gate_hist_pos; i < HIST_SIZE; i++) {
+                for (int i = gate_hist_pos; i < HIST_SIZE; i++) {
                     const unsigned nb_v = ebur128->i400.histogram[i].count;
                     nb_integrated  += nb_v;
                     integrated_sum += nb_v * ebur128->i400.histogram[i].energy;
@@ -788,7 +788,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
                 int gate_hist_pos = gate_update(&ebur128->i3000, power_3000,
                                                 loudness_3000, LRA_GATE_THRES);
 
-                for (i = gate_hist_pos; i < HIST_SIZE; i++)
+                for (int i = gate_hist_pos; i < HIST_SIZE; i++)
                     nb_powers += ebur128->i3000.histogram[i].count;
                 if (nb_powers) {
                     uint64_t n, nb_pow;
@@ -796,7 +796,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
                     /* get lower loudness to consider */
                     n = 0;
                     nb_pow = LRA_LOWER_PRC * nb_powers * 0.01 + 0.5;
-                    for (i = gate_hist_pos; i < HIST_SIZE; i++) {
+                    for (int i = gate_hist_pos; i < HIST_SIZE; i++) {
                         n += ebur128->i3000.histogram[i].count;
                         if (n >= nb_pow) {
                             ebur128->lra_low = ebur128->i3000.histogram[i].loudness;
@@ -807,7 +807,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
                     /* get higher loudness to consider */
                     n = nb_powers;
                     nb_pow = LRA_HIGHER_PRC * nb_powers * 0.01 + 0.5;
-                    for (i = HIST_SIZE - 1; i >= 0; i--) {
+                    for (int i = HIST_SIZE - 1; i >= 0; i--) {
                         n -= FFMIN(n, ebur128->i3000.histogram[i].count);
                         if (n < nb_pow) {
                             ebur128->lra_high = ebur128->i3000.histogram[i].loudness;
@@ -909,7 +909,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
     if (ebur128->peak_mode & PEAK_MODE_ ## ptype ## _PEAKS) {               \
         double max_peak = 0.0;                                              \
         char key[64];                                                       \
-        for (ch = 0; ch < nb_channels; ch++) {                              \
+        for (int ch = 0; ch < nb_channels; ch++) {                          \
             snprintf(key, sizeof(key),                                      \
                      META_PREFIX AV_STRINGIFY(name) "_peaks_ch%d", ch);     \
             max_peak = fmax(max_peak, ebur128->name##_peaks[ch]);           \
@@ -948,7 +948,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
 #define PRINT_PEAKS(str, sp, ptype) do {                            \
     if (ebur128->peak_mode & PEAK_MODE_ ## ptype ## _PEAKS) {       \
         av_log(ctx, ebur128->loglevel, "  " str ":");               \
-        for (ch = 0; ch < nb_channels; ch++)                        \
+        for (int ch = 0; ch < nb_channels; ch++)                    \
             av_log(ctx, ebur128->loglevel, " %5.1f", DBFS(sp[ch])); \
         av_log(ctx, ebur128->loglevel, " dBFS");                    \
     }                                                               \
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 12/13] avfilter/f_ebur128: move true peak calculation to DSP function
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
                   ` (9 preceding siblings ...)
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 11/13] avfilter/f_ebur128: move variable declarations to usage site Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 13/13] avfilter/x86/f_ebur128: implement AVX true peak calculation Niklas Haas
  11 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

---
 libavfilter/f_ebur128.c | 43 ++++++++++++++++++++++++++---------------
 libavfilter/f_ebur128.h |  4 ++++
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 1fb7129271..0adc89c823 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -580,6 +580,8 @@ static av_cold int init(AVFilterContext *ctx)
     av_log(ctx, AV_LOG_VERBOSE, "EBU +%d scale\n", ebur128->meter);
 
     ebur128->dsp.filter_channels = ff_ebur128_filter_channels_c;
+    ebur128->dsp.true_peak = ff_ebur128_true_peak_c;
+
 #if ARCH_X86
     ff_ebur128_init_x86(&ebur128->dsp);
 #endif
@@ -650,6 +652,28 @@ void ff_ebur128_filter_channels_c(const EBUR128DSPContext *dsp,
     }
 }
 
+double ff_ebur128_true_peak_c(double *restrict true_peaks,
+                              double *restrict true_peaks_per_frame,
+                              const int nb_channels, const double *samples,
+                              const int nb_samples)
+{
+    double maxpeak = 0.0; /* return value: largest running peak over all channels */
+    for (int ch = 0; ch < nb_channels; ch++) {
+        double tp   = true_peaks[ch]; /* running peak, carried over from earlier calls */
+        double tppf = 0.0;            /* peak of the samples passed to this call only */
+        for (int i = 0; i < nb_samples; i++) { /* interleaved input: frame i, channel ch */
+            const double sample = fabs(samples[i * nb_channels + ch]);
+            tp   = FFMAX(tp,   sample);
+            tppf = FFMAX(tppf, sample);
+        }
+        maxpeak = FFMAX(maxpeak, tp);
+        true_peaks[ch] = tp;
+        true_peaks_per_frame[ch] = tppf;
+    }
+
+    return maxpeak;
+}
+
 static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
 {
     int ret;
@@ -669,21 +693,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
         if (ret < 0)
             return ret;
 
-        double maxpeak = 0.0;
-        for (int ch = 0; ch < nb_channels; ch++) {
-            double tp   = ebur128->true_peaks[ch];
-            double tppf = 0.0;
-            for (int i = 0; i < ret; i++) {
-                const double sample = fabs(swr_samples[i * nb_channels]);
-                tp   = FFMAX(tp,   sample);
-                tppf = FFMAX(tppf, sample);
-            }
-            maxpeak = FFMAX(maxpeak, tp);
-            ebur128->true_peaks[ch] = tp;
-            ebur128->true_peaks_per_frame[ch] = tppf;
-        }
-
-        ebur128->true_peak = DBFS(maxpeak);
+        ebur128->true_peak = DBFS(dsp->true_peak(ebur128->true_peaks,
+                                                 ebur128->true_peaks_per_frame,
+                                                 nb_channels, swr_samples, ret));
     }
 #endif
 
@@ -704,7 +716,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
         ebur128->sample_peak = DBFS(maxpeak);
     }
 
-
     for (int idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
         const int bin_id_400  = ebur128->i400.cache_pos++;
         const int bin_id_3000 = ebur128->i3000.cache_pos++;
diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
index 1889e28bdd..8aab7838a0 100644
--- a/libavfilter/f_ebur128.h
+++ b/libavfilter/f_ebur128.h
@@ -45,6 +45,8 @@ typedef struct EBUR128DSPContext {
                             double *cache_400, double *cache_3000,
                             double *sum_400, double *sum_3000,
                             int nb_channels);
+
+    double (*true_peak)(double *, double *, int, const double *, int);
 } EBUR128DSPContext;
 
 static_assert(offsetof(EBUR128DSPContext, pre) == 0,                   "struct layout mismatch");
@@ -56,4 +58,6 @@ void ff_ebur128_init_x86(EBUR128DSPContext *dsp);
 void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
                                   double *, double *, double *, double *, int);
 
+double ff_ebur128_true_peak_c(double *, double *, int, const double *, int);
+
 #endif /* AVFILTER_F_EBUR128_H */
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [FFmpeg-devel] [PATCH v4 13/13] avfilter/x86/f_ebur128: implement AVX true peak calculation
  2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
                   ` (10 preceding siblings ...)
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 12/13] avfilter/f_ebur128: move true peak calculation to DSP function Niklas Haas
@ 2025-06-17 12:06 ` Niklas Haas
  2025-06-17 13:48   ` Niklas Haas
  11 siblings, 1 reply; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 12:06 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

From: Niklas Haas <git@haasn.dev>

Stereo only, for simplicity. Slightly faster than the C code.
---
 libavfilter/f_ebur128.c          |  8 +++-----
 libavfilter/f_ebur128.h          |  2 +-
 libavfilter/x86/f_ebur128.asm    | 25 +++++++++++++++++++++++++
 libavfilter/x86/f_ebur128_init.c |  9 +++++++--
 4 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 0adc89c823..c64f6ed032 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -502,6 +502,9 @@ static int config_audio_output(AVFilterLink *outlink)
             return AVERROR(ENOMEM);
     }
 
+#if ARCH_X86
+    ff_ebur128_init_x86(&ebur128->dsp, nb_channels);
+#endif
     return 0;
 }
 
@@ -581,11 +584,6 @@ static av_cold int init(AVFilterContext *ctx)
 
     ebur128->dsp.filter_channels = ff_ebur128_filter_channels_c;
     ebur128->dsp.true_peak = ff_ebur128_true_peak_c;
-
-#if ARCH_X86
-    ff_ebur128_init_x86(&ebur128->dsp);
-#endif
-
     return 0;
 }
 
diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
index 8aab7838a0..5fb9d4a8d5 100644
--- a/libavfilter/f_ebur128.h
+++ b/libavfilter/f_ebur128.h
@@ -53,7 +53,7 @@ static_assert(offsetof(EBUR128DSPContext, pre) == 0,                   "struct l
 static_assert(offsetof(EBUR128DSPContext, rlb) == 5  * sizeof(double), "struct layout mismatch");
 static_assert(offsetof(EBUR128DSPContext, y)   == 10 * sizeof(double), "struct layout mismatch");
 
-void ff_ebur128_init_x86(EBUR128DSPContext *dsp);
+void ff_ebur128_init_x86(EBUR128DSPContext *dsp, int nb_channels);
 
 void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
                                   double *, double *, double *, double *, int);
diff --git a/libavfilter/x86/f_ebur128.asm b/libavfilter/x86/f_ebur128.asm
index d9cc8d9361..53dd3f858d 100644
--- a/libavfilter/x86/f_ebur128.asm
+++ b/libavfilter/x86/f_ebur128.asm
@@ -39,6 +39,10 @@ struc DSP
     .z resq 1
 endstruc
 
+SECTION_RODATA
+
+abs_mask: dq 0x7FFFFFFFFFFFFFFF
+
 SECTION .text
 
 %macro MOVNQ 3 ; num, dst, src
@@ -139,3 +143,24 @@ cglobal ebur128_filter_channels, 7, 9, 14, dsp, samples, cache400, cache3000, su
     test channelsd, channelsd
     jnz .loop
     RET
+
+cglobal ebur128_true_peak_stereo, 5, 6, 4, tp, tppf, channels, samples, nb_samples
+    vpbroadcastq m4, [abs_mask]
+    pxor m0, m0      ; maxpeak
+    movupd m1, [tpq] ; tp
+    pxor m2, m2      ; tppf
+.inner:
+    movupd m3, [samplesq]
+    add samplesq, 16
+    pand m3, m4
+    maxpd m1, m3
+    maxpd m2, m3
+    dec nb_samplesd
+    jg .inner
+    movupd [tpq],  m1
+    movupd [tppfq], m2
+    maxpd m0, m1
+    shufpd m1, m0, m0, 1
+    maxpd m0, m1
+    movq rax, m0
+    RET
diff --git a/libavfilter/x86/f_ebur128_init.c b/libavfilter/x86/f_ebur128_init.c
index 8f38aee967..527e5e4dbc 100644
--- a/libavfilter/x86/f_ebur128_init.c
+++ b/libavfilter/x86/f_ebur128_init.c
@@ -26,10 +26,15 @@
 void ff_ebur128_filter_channels_avx(const EBUR128DSPContext *, const double *,
                                     double *, double *, double *, double *, int);
 
-av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp)
+double ff_ebur128_true_peak_stereo_avx(double *, double *, int, const double *, int);
+
+av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp, int nb_channels)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags))
+    if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags)) {
         dsp->filter_channels = ff_ebur128_filter_channels_avx;
+        if (nb_channels == 2)
+            dsp->true_peak = ff_ebur128_true_peak_stereo_avx;
+    }
 }
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 13/13] avfilter/x86/f_ebur128: implement AVX true peak calculation
  2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 13/13] avfilter/x86/f_ebur128: implement AVX true peak calculation Niklas Haas
@ 2025-06-17 13:48   ` Niklas Haas
  0 siblings, 0 replies; 14+ messages in thread
From: Niklas Haas @ 2025-06-17 13:48 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Niklas Haas

On Tue, 17 Jun 2025 14:06:06 +0200 Niklas Haas <ffmpeg@haasn.xyz> wrote:
> From: Niklas Haas <git@haasn.dev>
>
> Stereo only, for simplicity. Slightly faster than the C code.

I will drop this and the prior commit and resubmit them in a refactored form
that also speeds up the sample peak calculation.

In the meantime, I would appreciate a review of the rest.

> ---
>  libavfilter/f_ebur128.c          |  8 +++-----
>  libavfilter/f_ebur128.h          |  2 +-
>  libavfilter/x86/f_ebur128.asm    | 25 +++++++++++++++++++++++++
>  libavfilter/x86/f_ebur128_init.c |  9 +++++++--
>  4 files changed, 36 insertions(+), 8 deletions(-)
>
> diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
> index 0adc89c823..c64f6ed032 100644
> --- a/libavfilter/f_ebur128.c
> +++ b/libavfilter/f_ebur128.c
> @@ -502,6 +502,9 @@ static int config_audio_output(AVFilterLink *outlink)
>              return AVERROR(ENOMEM);
>      }
>
> +#if ARCH_X86
> +    ff_ebur128_init_x86(&ebur128->dsp, nb_channels);
> +#endif
>      return 0;
>  }
>
> @@ -581,11 +584,6 @@ static av_cold int init(AVFilterContext *ctx)
>
>      ebur128->dsp.filter_channels = ff_ebur128_filter_channels_c;
>      ebur128->dsp.true_peak = ff_ebur128_true_peak_c;
> -
> -#if ARCH_X86
> -    ff_ebur128_init_x86(&ebur128->dsp);
> -#endif
> -
>      return 0;
>  }
>
> diff --git a/libavfilter/f_ebur128.h b/libavfilter/f_ebur128.h
> index 8aab7838a0..5fb9d4a8d5 100644
> --- a/libavfilter/f_ebur128.h
> +++ b/libavfilter/f_ebur128.h
> @@ -53,7 +53,7 @@ static_assert(offsetof(EBUR128DSPContext, pre) == 0,                   "struct l
>  static_assert(offsetof(EBUR128DSPContext, rlb) == 5  * sizeof(double), "struct layout mismatch");
>  static_assert(offsetof(EBUR128DSPContext, y)   == 10 * sizeof(double), "struct layout mismatch");
>
> -void ff_ebur128_init_x86(EBUR128DSPContext *dsp);
> +void ff_ebur128_init_x86(EBUR128DSPContext *dsp, int nb_channels);
>
>  void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
>                                    double *, double *, double *, double *, int);
> diff --git a/libavfilter/x86/f_ebur128.asm b/libavfilter/x86/f_ebur128.asm
> index d9cc8d9361..53dd3f858d 100644
> --- a/libavfilter/x86/f_ebur128.asm
> +++ b/libavfilter/x86/f_ebur128.asm
> @@ -39,6 +39,10 @@ struc DSP
>      .z resq 1
>  endstruc
>
> +SECTION_RODATA
> +
> +abs_mask: dq 0x7FFFFFFFFFFFFFFF
> +
>  SECTION .text
>
>  %macro MOVNQ 3 ; num, dst, src
> @@ -139,3 +143,24 @@ cglobal ebur128_filter_channels, 7, 9, 14, dsp, samples, cache400, cache3000, su
>      test channelsd, channelsd
>      jnz .loop
>      RET
> +
> +cglobal ebur128_true_peak_stereo, 5, 6, 4, tp, tppf, channels, samples, nb_samples
> +    vpbroadcastq m4, [abs_mask]
> +    pxor m0, m0      ; maxpeak
> +    movupd m1, [tpq] ; tp
> +    pxor m2, m2      ; tppf
> +.inner:
> +    movupd m3, [samplesq]
> +    add samplesq, 16
> +    pand m3, m4
> +    maxpd m1, m3
> +    maxpd m2, m3
> +    dec nb_samplesd
> +    jg .inner
> +    movupd [tpq],  m1
> +    movupd [tppfq], m2
> +    maxpd m0, m1
> +    shufpd m1, m0, m0, 1
> +    maxpd m0, m1
> +    movq rax, m0
> +    RET
> diff --git a/libavfilter/x86/f_ebur128_init.c b/libavfilter/x86/f_ebur128_init.c
> index 8f38aee967..527e5e4dbc 100644
> --- a/libavfilter/x86/f_ebur128_init.c
> +++ b/libavfilter/x86/f_ebur128_init.c
> @@ -26,10 +26,15 @@
>  void ff_ebur128_filter_channels_avx(const EBUR128DSPContext *, const double *,
>                                      double *, double *, double *, double *, int);
>
> -av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp)
> +double ff_ebur128_true_peak_stereo_avx(double *, double *, int, const double *, int);
> +
> +av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp, int nb_channels)
>  {
>      int cpu_flags = av_get_cpu_flags();
>
> -    if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags))
> +    if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags)) {
>          dsp->filter_channels = ff_ebur128_filter_channels_avx;
> +        if (nb_channels == 2)
> +            dsp->true_peak = ff_ebur128_true_peak_stereo_avx;
> +    }
>  }
> --
> 2.49.0
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2025-06-17 13:48 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-06-17 12:05 [FFmpeg-devel] [PATCH v4 01/13] avfilter/f_ebur128: use transformed direct form II Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 02/13] avfilter/f_ebur128: simplify sample cache array Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 03/13] avfilter/f_ebur128: use structs for biquad weights Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 04/13] avfilter/f_ebur128: use a single packed array for the integrator cache Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 05/13] avfilter/f_ebur128: move weights and cache to EBUR128DSPContext Niklas Haas
2025-06-17 12:05 ` [FFmpeg-devel] [PATCH v4 06/13] avfilter/f_ebur128: split off C implementation to separate function Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 07/13] avfilter/x86/f_ebur128: add x86 AVX implementation Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 08/13] avfilter/f_ebur128: remove pointless macro Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 09/13] avfilter/f_ebur128: move true peak calculation out of main loop Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 10/13] avfilter/f_ebur128: lift sample " Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 11/13] avfilter/f_ebur128: move variable declarations to usage site Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 12/13] avfilter/f_ebur128: move true peak calculation to DSP function Niklas Haas
2025-06-17 12:06 ` [FFmpeg-devel] [PATCH v4 13/13] avfilter/x86/f_ebur128: implement AVX true peak calculation Niklas Haas
2025-06-17 13:48   ` Niklas Haas

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git