From f5257ca9ed821e9fb3dd9edc3487da4d06ba47a3 Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Wed, 25 Oct 2023 09:58:24 +0200
Subject: [PATCH 1/4] avcodec/mlpenc: replace naive rematrix with brute-force
 search

Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavcodec/mlpenc.c | 183 +++++++++++++++++++++++++++++++-------------
 1 file changed, 129 insertions(+), 54 deletions(-)

diff --git a/libavcodec/mlpenc.c b/libavcodec/mlpenc.c
index 6b801605db..27ef5f2c82 100644
--- a/libavcodec/mlpenc.c
+++ b/libavcodec/mlpenc.c
@@ -136,7 +136,8 @@ typedef struct MLPEncodeContext {
     int             min_restart_interval;   ///< Min interval of access units in between two major frames.
     int             cur_restart_interval;
     int             lpc_coeff_precision;
-    int             rematrix_precision;
+    int             rematrix_search_step;
+    int             rematrix_search_limit;
     int             lpc_type;
     int             lpc_passes;
     int             prediction_order;
@@ -1399,79 +1400,150 @@ static void determine_filters(MLPEncodeContext *ctx, MLPSubstream *s)
         set_filter(ctx, s, ch, 0);
 }
 
+/* Invert the 2x2 fixed-point matrix s (1 << 14 == 1.0) into d, same format.
+ * Returns 0 on success, or -1 when s is singular or an inverse coefficient
+ * does not fit in int32_t; the contents of d are unspecified on failure. */
+static int invert2x2(const int32_t *s, int32_t *d)
+{
+    /* Adjugate in 64 bits; adj * 2^28 / det rescales to the format of s. */
+    const int64_t adj[4] = { s[3], -(int64_t)s[1], -(int64_t)s[2], s[0] };
+    const int64_t det = s[0] * adj[0] + s[1] * adj[2];
+
+    if (det == 0)
+        return -1;
+
+    for (int i = 0; i < 4; i++) {
+        const int64_t q = adj[i] * (1LL << 28) / det;
+        if (q < INT32_MIN || q > INT32_MAX)
+            return -1;
+        d[i] = (int32_t)q;
+    }
+    return 0;
+}
+
 static int estimate_coeff(MLPEncodeContext *ctx, MLPSubstream *s,
-                          MatrixParams *mp,
-                          int ch0, int ch1)
+                          MatrixParams *mp, int ch0, int ch1)
 {
-    int32_t maxl = INT32_MIN, maxr = INT32_MIN, minl = INT32_MAX, minr = INT32_MAX;
-    int64_t summ = 0, sums = 0, suml = 0, sumr = 0, enl = 0, enr = 0;
-    const int shift = 14 - ctx->rematrix_precision;
-    int32_t cf0, cf1, e[4], d[4], ml, mr;
-    int i, count = 0;
+    /* Brute-force search for a 2x2 rematrix of channels ch0/ch1: each candidate
+     * replaces one channel by a mix of both and must be exactly invertible on the
+     * buffered samples. Returns 1 if a matrix was written to mp, 0 otherwise. */
+    const int search_limit = 1 << ctx->rematrix_search_limit;
+    const int search_step = 1 << ctx->rematrix_search_step;
 
-    for (int j = 0; j <= ctx->cur_restart_interval; j++) {
-        DecodingParams *dp = &s->b[j].decoding_params;
-        const int32_t *ch[2];
+    int32_t best[4], d[4], e[4], v[2] = { 0, 0 }, inc = search_step, count = 0, chan = -1;
+    uint64_t best_sum = UINT64_MAX;
 
-        ch[0] = dp->sample_buffer[ch0];
-        ch[1] = dp->sample_buffer[ch1];
+    /* e is the forward matrix built from the candidate (v[0], v[1]), d its inverse. */
 
-        for (int i = 0; i < dp->blocksize; i++) {
-            int32_t lm = ch[0][i], rm = ch[1][i];
+    while (1) {
+        for (int c = 0; c < 2; c++) { /* c selects which of the two rows is non-trivial */
+            uint64_t sum = 0;
+
+            if (c) {
+                e[0] = 1 << 14;
+                e[1] = 0 << 14;
+                e[2] = v[1];
+                e[3] = v[0];
+            } else {
+                e[0] = v[0];
+                e[1] = v[1];
+                e[2] = 0 << 14;
+                e[3] = 1 << 14;
+            }
 
-            enl  += FFABS(lm);
-            enr  += FFABS(rm);
+            if (invert2x2(e, d)) { /* no usable inverse */
+                sum = UINT64_MAX;
+                goto next;
+            }
 
-            summ += FFABS(lm + rm);
-            sums += FFABS(lm - rm);
+            for (int i = 0; i < 4; i++) {
+                if (d[i] != av_clip_intp2(d[i], 15)) { /* inverse coefficient out of range */
+                    sum = UINT64_MAX;
+                    goto next;
+                }
+            }
 
-            suml += lm;
-            sumr += rm;
+            for (int j = 0; j <= ctx->cur_restart_interval; j++) {
+                DecodingParams *dp = &s->b[j].decoding_params;
+                const int32_t *ch[2];
 
-            maxl = FFMAX(maxl, lm);
-            maxr = FFMAX(maxr, rm);
+                ch[0] = dp->sample_buffer[ch0];
+                ch[1] = dp->sample_buffer[ch1];
 
-            minl = FFMIN(minl, lm);
-            minr = FFMIN(minr, rm);
-        }
-    }
+                for (int i = 0; i < dp->blocksize; i++) {
+                    const int64_t lm = ch[0][i], rm = ch[1][i];
+                    int64_t lt, rt;
 
-    summ -= FFABS(suml + sumr);
-    sums -= FFABS(suml - sumr);
+                    lt = ((lm * e[0]) >> 14) + ((rm * e[1]) >> 14);
+                    rt = ((lm * e[2]) >> 14) + ((rm * e[3]) >> 14);
 
-    ml = maxl - minl;
-    mr = maxr - minr;
+                    if (FFABS(lt) > (1LL << 23) ||
+                        FFABS(rt) > (1LL << 23)) { /* rematrixed sample too large */
+                        sum = UINT64_MAX;
+                        goto next;
+                    }
 
-    if (!summ && !sums)
-        return 0;
+                    /* The cost is the summed magnitude of the replaced channel. */
+                    if (c)
+                        sum += FFABS(rt);
+                    else
+                        sum += FFABS(lt);
+                    if (sum > best_sum) /* already worse than the best candidate */
+                        goto next;
+                    /* The inverse matrix must restore both input samples exactly. */
+                    if ((((lt * d[0]) >> 14) + ((rt * d[1]) >> 14)) != lm) {
+                        sum = UINT64_MAX;
+                        goto next;
+                    }
 
-    if (!ml || !mr)
-        return 0;
+                    if ((((lt * d[2]) >> 14) + ((rt * d[3]) >> 14)) != rm) {
+                        sum = UINT64_MAX;
+                        goto next;
+                    }
+                }
+            }
 
-    if ((FFABS(ml) + FFABS(mr)) >= (1 << 24))
-        return 0;
+next:
+            if (sum < best_sum) {
+                chan = c;
+                best_sum = sum;
+                memcpy(best, e, sizeof(e));
+            }
+        }
 
-    cf0 = (FFMIN(FFABS(mr), FFABS(ml)) * (1LL << 14)) / FFMAX(FFABS(ml), FFABS(mr));
-    cf0 = (cf0 >> shift) << shift;
-    cf1 = -cf0;
+        v[1] += inc;
+
+        if (v[1] < -search_limit) {
+            if (v[0] > search_limit) {
+                v[0] = -search_step;
+            } else if (v[0] >= 0) {
+                v[0] += search_step;
+            } else if (v[0] >= -search_limit) {
+                v[0] -= search_step;
+            } else {
+                break;
+            }
 
-    if (sums > summ)
-        FFSWAP(int32_t, cf0, cf1);
+            inc = search_step;
+        } else if (v[1] > search_limit) {
+            v[1] = 0;
+            inc  = -search_step;
+        }
 
-    count = 1;
-    i = enl < enr;
-    mp->outch[0] = ch0 + i;
+        if (best_sum == 0ULL) /* cannot be improved upon */
+            break;
+    }
 
-    d[!i] = cf0;
-    d[ i] = 1 << 14;
-    e[!i] = cf1;
-    e[ i] = 1 << 14;
+    if (chan < 0)
+        return 0;
 
-    mp->coeff[0][ch0] = av_clip_intp2(d[0], 15);
-    mp->coeff[0][ch1] = av_clip_intp2(d[1], 15);
+    mp->outch[0] = chan ? ch1 : ch0; /* the channel that matrix row chan replaces */
+    memcpy(e, best, sizeof(e));
+    invert2x2(e, d); /* recompute the inverse of the winning matrix */
+    count = 1;
 
-    mp->forco[0][ch0] = av_clip_intp2(e[0], 15);
-    mp->forco[0][ch1] = av_clip_intp2(e[1], 15);
+    mp->coeff[0][ch0] = d[chan * 2 + 0]; mp->coeff[0][ch1] = d[chan * 2 + 1];
+    mp->forco[0][ch0] = e[chan * 2 + 0]; mp->forco[0][ch1] = e[chan * 2 + 1];
 
     return count;
 }
@@ -2060,11 +2132,13 @@ static void set_major_params(MLPEncodeContext *ctx, MLPSubstream *s)
     for (int index = 0; index < s->b[ctx->restart_intervals-1].seq_size; index++) {
         memcpy(&s->b[index].major_decoding_params,
                &s->b[index].decoding_params, sizeof(DecodingParams));
+
         for (int ch = 0; ch <= rh->max_matrix_channel; ch++) {
             int8_t shift = s->b[index].decoding_params.output_shift[ch];
 
             max_shift = FFMAX(max_shift, shift);
         }
+
         for (int ch = rh->min_channel; ch <= rh->max_channel; ch++) {
             uint8_t huff_lsbs = s->b[index].channel_params[ch].huff_lsbs;
 
@@ -2277,7 +2351,8 @@ static const AVOption mlp_options[] = {
 { "prediction_order", "Search method for selecting prediction order", OFFSET(prediction_order), AV_OPT_TYPE_INT, {.i64 = ORDER_METHOD_EST }, ORDER_METHOD_EST, ORDER_METHOD_SEARCH, FLAGS, "predm" },
 { "estimation", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = ORDER_METHOD_EST },    0, 0, FLAGS, "predm" },
 { "search",     NULL, 0, AV_OPT_TYPE_CONST, {.i64 = ORDER_METHOD_SEARCH }, 0, 0, FLAGS, "predm" },
-{ "rematrix_precision", "Rematrix coefficient precision", OFFSET(rematrix_precision), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, 14, FLAGS },
+{ "rematrix_limit", "Rematrix search limit precision", OFFSET(rematrix_search_limit), AV_OPT_TYPE_INT, {.i64 = 16 }, 14, 20, FLAGS },
+{ "rematrix_step", "Rematrix search step precision", OFFSET(rematrix_search_step), AV_OPT_TYPE_INT, {.i64 = 10 }, 1, 14, FLAGS },
 { NULL },
 };
 
-- 
2.42.0