[FFmpeg-devel] [PATCH] tta decoder improvements

* [FFmpeg-devel] [PATCH] tta decoder improvements
@ 2023-08-16 10:47 Paul B Mahol
  2023-08-16 16:49 ` Michael Niedermayer
  2023-08-16 18:14 ` Michael Niedermayer
  0 siblings, 2 replies; 3+ messages in thread
From: Paul B Mahol @ 2023-08-16 10:47 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 16 bytes --]

Patch attached.

[-- Attachment #2: 0001-avcodec-tta-switch-to-planar-sample-formats.patch --]
[-- Type: text/x-patch, Size: 10571 bytes --]

From 2b6ac4f7093157533b7f279a78a73bfabeb98cf0 Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Tue, 15 Aug 2023 21:13:59 +0200
Subject: [PATCH] avcodec/tta: switch to planar sample formats

Makes decoding a few percent faster.
Also fix code style while here.

Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavcodec/tta.c | 167 +++++++++++++++++++++++++++++++----------------
 1 file changed, 109 insertions(+), 58 deletions(-)

diff --git a/libavcodec/tta.c b/libavcodec/tta.c
index 3e89571f16..6add4106d3 100644
--- a/libavcodec/tta.c
+++ b/libavcodec/tta.c
@@ -55,7 +55,7 @@ typedef struct TTAContext {
     unsigned data_length;
     int frame_length, last_frame_length;
 
-    int32_t *decode_buffer;
+    int32_t **decode_buffer;
 
     uint8_t crc_pass[8];
     uint8_t *pass;
@@ -107,10 +107,16 @@ static int allocate_buffers(AVCodecContext *avctx)
     TTAContext *s = avctx->priv_data;
 
     if (s->bps < 3) {
-        s->decode_buffer = av_calloc(s->frame_length,
-                                     sizeof(*s->decode_buffer) * s->channels);
+        s->decode_buffer = av_calloc(s->channels, sizeof(*s->decode_buffer));
         if (!s->decode_buffer)
             return AVERROR(ENOMEM);
+
+        for (int ch = 0; ch < s->channels; ch++) {
+            s->decode_buffer[ch] = av_calloc(s->frame_length,
+                                             sizeof(*s->decode_buffer[ch]));
+            if (!s->decode_buffer[ch])
+                return AVERROR(ENOMEM);
+        }
     } else
         s->decode_buffer = NULL;
     s->ch_ctx = av_malloc_array(avctx->ch_layout.nb_channels, sizeof(*s->ch_ctx));
@@ -181,14 +187,14 @@ static av_cold int tta_decode_init(AVCodecContext * avctx)
         }
 
         switch(s->bps) {
-        case 1: avctx->sample_fmt = AV_SAMPLE_FMT_U8; break;
+        case 1: avctx->sample_fmt = AV_SAMPLE_FMT_U8P; break;
         case 2:
-            avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+            avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
             break;
         case 3:
-            avctx->sample_fmt = AV_SAMPLE_FMT_S32;
+            avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
             break;
-        //case 4: avctx->sample_fmt = AV_SAMPLE_FMT_S32; break;
+        //case 4: avctx->sample_fmt = AV_SAMPLE_FMT_S32P; break;
         default:
             av_log(avctx, AV_LOG_ERROR, "Invalid/unsupported sample format.\n");
             return AVERROR_INVALIDDATA;
@@ -231,10 +237,10 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     TTAContext *s = avctx->priv_data;
+    const int bps = s->bps;
     GetBitContext gb;
     int i, ret;
     int cur_chan = 0, framelen = s->frame_length;
-    uint32_t *p;
 
     if (avctx->err_recognition & AV_EF_CRCCHECK) {
         if (buf_size < 4 ||
@@ -251,14 +257,13 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
         return ret;
 
     // decode directly to output buffer for 24-bit sample format
-    if (s->bps == 3)
-        s->decode_buffer = (int32_t *)frame->data[0];
+    if (bps == 3)
+        s->decode_buffer = (int32_t **)frame->extended_data;
 
     // init per channel states
     for (i = 0; i < s->channels; i++) {
         TTAFilter *filter = &s->ch_ctx[i].filter;
-        s->ch_ctx[i].predictor = 0;
-        ff_tta_filter_init(filter, ff_tta_filter_configs[s->bps-1]);
+        ff_tta_filter_init(filter, ff_tta_filter_configs[bps-1]);
         if (s->format == FORMAT_ENCRYPTED) {
             int i;
             for (i = 0; i < 8; i++)
@@ -268,9 +273,8 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
     }
 
     i = 0;
-    for (p = s->decode_buffer; (int32_t*)p < s->decode_buffer + (framelen * s->channels); p++) {
-        int32_t *predictor = &s->ch_ctx[cur_chan].predictor;
-        TTAFilter *filter = &s->ch_ctx[cur_chan].filter;
+    for (int j = 0; j < framelen * s->channels; j++) {
+        int32_t *p = s->decode_buffer[cur_chan] + i;
         TTARice *rice = &s->ch_ctx[cur_chan].rice;
         uint32_t unary, depth, k;
         int32_t value;
@@ -306,44 +310,24 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
             rice->sum1 += value - (rice->sum1 >> 4);
             if (rice->k1 > 0 && rice->sum1 < ff_tta_shift_16[rice->k1])
                 rice->k1--;
-            else if(rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
+            else if (rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
                 rice->k1++;
             value += ff_tta_shift_1[rice->k0];
         default:
             rice->sum0 += value - (rice->sum0 >> 4);
             if (rice->k0 > 0 && rice->sum0 < ff_tta_shift_16[rice->k0])
                 rice->k0--;
-            else if(rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
+            else if (rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
                 rice->k0++;
         }
 
         // extract coded value
         *p = 1 + ((value >> 1) ^ ((value & 1) - 1));
 
-        // run hybrid filter
-        s->dsp.filter_process(filter->qm, filter->dx, filter->dl, &filter->error, p,
-                              filter->shift, filter->round);
-
-        // fixed order prediction
-#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
-        switch (s->bps) {
-        case 1: *p += PRED(*predictor, 4); break;
-        case 2:
-        case 3: *p += PRED(*predictor, 5); break;
-        case 4: *p +=      *predictor;     break;
-        }
-        *predictor = *p;
-
         // flip channels
         if (cur_chan < (s->channels-1))
             cur_chan++;
         else {
-            // decorrelate in case of multiple channels
-            if (s->channels > 1) {
-                int32_t *r = p - 1;
-                for (*p += *r / 2; r > (int32_t*)p - s->channels; r--)
-                    *r = *(r + 1) - *r;
-            }
             cur_chan = 0;
             i++;
             // check for last frame
@@ -354,6 +338,64 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
         }
     }
 
+    // run hybrid filter
+    for (int ch = 0; ch < s->channels; ch++) {
+        TTAFilter *filter = &s->ch_ctx[ch].filter;
+        const int32_t shift = filter->shift;
+        const int32_t round = filter->round;
+        int32_t *p = s->decode_buffer[ch];
+        int32_t error = filter->error;
+        int32_t *qm = filter->qm;
+        int32_t *dx = filter->dx;
+        int32_t *dl = filter->dl;
+
+        for (int n = 0; n < framelen; n++) {
+            s->dsp.filter_process(qm, dx, dl,
+                                  &error, &p[n],
+                                  shift, round);
+        }
+    }
+
+    // fixed order prediction
+#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
+    for (int ch = 0; ch < s->channels; ch++) {
+        int32_t *p = s->decode_buffer[ch];
+        int32_t predictor = 0;
+
+        switch (bps) {
+        case 1:
+            for (int n = 0; n < framelen; n++) {
+                p[n] += PRED(predictor, 4);
+                predictor = p[n];
+            }
+            break;
+        case 2:
+        case 3:
+            for (int n = 0; n < framelen; n++) {
+                p[n] += PRED(predictor, 5);
+                predictor = p[n];
+            }
+            break;
+        }
+    }
+
+    // decorrelate in case of multiple channels
+    if (s->channels > 1) {
+        int32_t *a = s->decode_buffer[s->channels-1];
+        int32_t *b = s->decode_buffer[s->channels-2];
+
+        for (int n = 0; n < framelen; n++)
+            a[n] += b[n] / 2;
+
+        for (int ch = s->channels - 1; ch >= 1; ch--) {
+            int32_t *b = s->decode_buffer[ch-1];
+            int32_t *c = s->decode_buffer[ch  ];
+
+            for (int n = 0; n < framelen; n++)
+                b[n] = c[n] - b[n];
+        }
+    }
+
     align_get_bits(&gb);
     if (get_bits_left(&gb) < 32) {
         ret = AVERROR_INVALIDDATA;
@@ -362,31 +404,34 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
     skip_bits_long(&gb, 32); // frame crc
 
     // convert to output buffer
-    switch (s->bps) {
-    case 1: {
-        uint8_t *samples = (uint8_t *)frame->data[0];
-        p = s->decode_buffer;
-        for (i = 0; i < framelen * s->channels; i++)
-            samples[i] = p[i] + 0x80;
-        break;
+    switch (bps) {
+    case 1:
+        for (int ch = 0; ch < s->channels; ch++) {
+            uint8_t *samples = (uint8_t *)frame->extended_data[ch];
+            int32_t *p = s->decode_buffer[ch];
+            for (i = 0; i < framelen; i++)
+                samples[i] = p[i] + 0x80;
         }
-    case 2: {
-        int16_t *samples = (int16_t *)frame->data[0];
-        p = s->decode_buffer;
-        for (i = 0; i < framelen * s->channels; i++)
-            samples[i] = p[i];
         break;
+    case 2:
+        for (int ch = 0; ch < s->channels; ch++) {
+            int16_t *samples = (int16_t *)frame->extended_data[ch];
+            int32_t *p = s->decode_buffer[ch];
+            for (i = 0; i < framelen; i++)
+                samples[i] = p[i];
         }
-    case 3: {
-        // shift samples for 24-bit sample format
-        int32_t *samples = (int32_t *)frame->data[0];
+        break;
+    case 3:
+        for (int ch = 0; ch < s->channels; ch++) {
+            // shift samples for 24-bit sample format
+            int32_t *samples = (int32_t *)frame->extended_data[ch];
 
-        for (i = 0; i < framelen * s->channels; i++)
-            samples[i] = samples[i] * 256U;
+            for (i = 0; i < framelen; i++)
+                samples[i] = samples[i] * 256U;
+        }
         // reset decode buffer
         s->decode_buffer = NULL;
         break;
-        }
     }
 
     *got_frame_ptr = 1;
@@ -394,16 +439,22 @@ static int tta_decode_frame(AVCodecContext *avctx, AVFrame *frame,
     return buf_size;
 error:
     // reset decode buffer
-    if (s->bps == 3)
+    if (bps == 3)
         s->decode_buffer = NULL;
     return ret;
 }
 
-static av_cold int tta_decode_close(AVCodecContext *avctx) {
+static av_cold int tta_decode_close(AVCodecContext *avctx)
+{
     TTAContext *s = avctx->priv_data;
 
-    if (s->bps < 3)
+    if (s->bps < 3) {
+        if (s->decode_buffer) {
+            for (int ch = 0; ch < s->channels; ch++)
+                av_freep(&s->decode_buffer[ch]);
+        }
         av_freep(&s->decode_buffer);
+    }
     s->decode_buffer = NULL;
     av_freep(&s->ch_ctx);
 
-- 
2.39.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 3+ messages in thread