* [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
@ 2022-09-23 23:14 Lynne
[not found] ` <NCgcUxK--3-2@lynne.ee-NCgcZNj----2>
2022-09-24 18:42 ` [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder " Martin Storsjö
0 siblings, 2 replies; 19+ messages in thread
From: Lynne @ 2022-09-23 23:14 UTC (permalink / raw)
To: Ffmpeg Devel
[-- Attachment #1: Type: text/plain, Size: 159 bytes --]
This commit changes both the encoder and decoder to use the new lavu/tx code,
which has faster C transforms and more assembly optimizations.
Patch attached.
[-- Attachment #2: 0001-opus-convert-encoder-and-decoder-to-lavu-tx.patch --]
[-- Type: text/x-diff, Size: 9180 bytes --]
From d4fdda5b57ab1e0f08eb3d78dac6b003060dfd41 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Sat, 24 Sep 2022 00:46:44 +0200
Subject: [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
This commit changes both the encoder and decoder to use the new lavu/tx code,
which has faster C transforms and more assembly optimizations.
---
libavcodec/opus_celt.c | 20 ++++++++++++--------
libavcodec/opus_celt.h | 5 +++--
libavcodec/opusenc.c | 15 +++++++++------
libavcodec/opusenc_psy.c | 13 ++++++++-----
libavcodec/opusenc_psy.h | 4 +++-
5 files changed, 35 insertions(+), 22 deletions(-)
diff --git a/libavcodec/opus_celt.c b/libavcodec/opus_celt.c
index 9dbeff1927..f1fb88a56d 100644
--- a/libavcodec/opus_celt.c
+++ b/libavcodec/opus_celt.c
@@ -323,7 +323,8 @@ int ff_celt_decode_frame(CeltFrame *f, OpusRangeCoder *rc,
{
int i, j, downmix = 0;
int consumed; // bits of entropy consumed thus far for this frame
- MDCT15Context *imdct;
+ AVTXContext *imdct;
+ av_tx_fn imdct_fn;
if (channels != 1 && channels != 2) {
av_log(f->avctx, AV_LOG_ERROR, "Invalid number of coded channels: %d\n",
@@ -385,7 +386,8 @@ int ff_celt_decode_frame(CeltFrame *f, OpusRangeCoder *rc,
f->blocks = f->transient ? 1 << f->size : 1;
f->blocksize = frame_size / f->blocks;
- imdct = f->imdct[f->transient ? 0 : f->size];
+ imdct = f->tx[f->transient ? 0 : f->size];
+ imdct_fn = f->tx_fn[f->transient ? 0 : f->size];
if (channels == 1) {
for (i = 0; i < CELT_MAX_BANDS; i++)
@@ -440,8 +442,8 @@ int ff_celt_decode_frame(CeltFrame *f, OpusRangeCoder *rc,
for (j = 0; j < f->blocks; j++) {
float *dst = block->buf + 1024 + j * f->blocksize;
- imdct->imdct_half(imdct, dst + CELT_OVERLAP / 2, f->block[i].coeffs + j,
- f->blocks);
+ imdct_fn(imdct, dst + CELT_OVERLAP / 2, f->block[i].coeffs + j,
+ sizeof(float)*f->blocks);
f->dsp->vector_fmul_window(dst, dst, dst + CELT_OVERLAP / 2,
ff_celt_window, CELT_OVERLAP / 2);
}
@@ -526,8 +528,8 @@ void ff_celt_free(CeltFrame **f)
if (!frm)
return;
- for (i = 0; i < FF_ARRAY_ELEMS(frm->imdct); i++)
- ff_mdct15_uninit(&frm->imdct[i]);
+ for (i = 0; i < FF_ARRAY_ELEMS(frm->tx); i++)
+ av_tx_uninit(&frm->tx[i]);
ff_celt_pvq_uninit(&frm->pvq);
@@ -555,9 +557,11 @@ int ff_celt_init(AVCodecContext *avctx, CeltFrame **f, int output_channels,
frm->output_channels = output_channels;
frm->apply_phase_inv = apply_phase_inv;
- for (i = 0; i < FF_ARRAY_ELEMS(frm->imdct); i++)
- if ((ret = ff_mdct15_init(&frm->imdct[i], 1, i + 3, -1.0f/32768)) < 0)
+ for (i = 0; i < FF_ARRAY_ELEMS(frm->tx); i++) {
+ const float scale = -1.0f/32768;
+ if ((ret = av_tx_init(&frm->tx[i], &frm->tx_fn[i], AV_TX_FLOAT_MDCT, 1, 15 << (i + 3), &scale, 0)) < 0)
goto fail;
+ }
if ((ret = ff_celt_pvq_init(&frm->pvq, 0)) < 0)
goto fail;
diff --git a/libavcodec/opus_celt.h b/libavcodec/opus_celt.h
index 661ca251de..291a544298 100644
--- a/libavcodec/opus_celt.h
+++ b/libavcodec/opus_celt.h
@@ -30,10 +30,10 @@
#include "opus_pvq.h"
#include "opusdsp.h"
-#include "mdct15.h"
#include "libavutil/float_dsp.h"
#include "libavutil/libm.h"
#include "libavutil/mem_internal.h"
+#include "libavutil/tx.h"
#define CELT_VECTORS 11
#define CELT_ALLOC_STEPS 6
@@ -93,7 +93,8 @@ typedef struct CeltBlock {
struct CeltFrame {
// constant values that do not change during context lifetime
AVCodecContext *avctx;
- MDCT15Context *imdct[4];
+ AVTXContext *tx[4];
+ av_tx_fn tx_fn[4];
AVFloatDSPContext *dsp;
CeltBlock block[2];
CeltPVQ *pvq;
diff --git a/libavcodec/opusenc.c b/libavcodec/opusenc.c
index a7a9d3a5f5..8cdd27d930 100644
--- a/libavcodec/opusenc.c
+++ b/libavcodec/opusenc.c
@@ -40,7 +40,8 @@ typedef struct OpusEncContext {
AVCodecContext *avctx;
AudioFrameQueue afq;
AVFloatDSPContext *dsp;
- MDCT15Context *mdct[CELT_BLOCK_NB];
+ AVTXContext *tx[CELT_BLOCK_NB];
+ av_tx_fn tx_fn[CELT_BLOCK_NB];
CeltPVQ *pvq;
struct FFBufQueue bufqueue;
@@ -204,7 +205,7 @@ static void celt_frame_mdct(OpusEncContext *s, CeltFrame *f)
s->dsp->vector_fmul_reverse(&win[CELT_OVERLAP], src2,
ff_celt_window - 8, 128);
src1 = src2;
- s->mdct[0]->mdct(s->mdct[0], b->coeffs + t, win, f->blocks);
+ s->tx_fn[0](s->tx[0], b->coeffs + t, win, sizeof(float)*f->blocks);
}
}
} else {
@@ -226,7 +227,7 @@ static void celt_frame_mdct(OpusEncContext *s, CeltFrame *f)
ff_celt_window - 8, 128);
memcpy(win + lap_dst + blk_len, temp, CELT_OVERLAP*sizeof(float));
- s->mdct[f->size]->mdct(s->mdct[f->size], b->coeffs, win, 1);
+ s->tx_fn[f->size](s->tx[f->size], b->coeffs, win, sizeof(float));
}
}
@@ -612,7 +613,7 @@ static av_cold int opus_encode_end(AVCodecContext *avctx)
OpusEncContext *s = avctx->priv_data;
for (int i = 0; i < CELT_BLOCK_NB; i++)
- ff_mdct15_uninit(&s->mdct[i]);
+ av_tx_uninit(&s->tx[i]);
ff_celt_pvq_uninit(&s->pvq);
av_freep(&s->dsp);
@@ -668,9 +669,11 @@ static av_cold int opus_encode_init(AVCodecContext *avctx)
return AVERROR(ENOMEM);
/* I have no idea why a base scaling factor of 68 works, could be the twiddles */
- for (int i = 0; i < CELT_BLOCK_NB; i++)
- if ((ret = ff_mdct15_init(&s->mdct[i], 0, i + 3, 68 << (CELT_BLOCK_NB - 1 - i))))
+ for (int i = 0; i < CELT_BLOCK_NB; i++) {
+ const float scale = 68 << (CELT_BLOCK_NB - 1 - i);
+ if ((ret = av_tx_init(&s->tx[i], &s->tx_fn[i], AV_TX_FLOAT_MDCT, 0, 15 << (i + 3), &scale, 0)))
return AVERROR(ENOMEM);
+ }
/* Zero out previous energy (matters for inter first frame) */
for (int ch = 0; ch < s->channels; ch++)
diff --git a/libavcodec/opusenc_psy.c b/libavcodec/opusenc_psy.c
index 1c8f69269c..3bff57d347 100644
--- a/libavcodec/opusenc_psy.c
+++ b/libavcodec/opusenc_psy.c
@@ -22,7 +22,6 @@
#include "opusenc_psy.h"
#include "opus_pvq.h"
#include "opustab.h"
-#include "mdct15.h"
#include "libavutil/qsort.h"
static float pvq_band_cost(CeltPVQ *pvq, CeltFrame *f, OpusRangeCoder *rc, int band,
@@ -99,7 +98,8 @@ static void step_collect_psy_metrics(OpusPsyContext *s, int index)
s->dsp->vector_fmul(s->scratch, s->scratch, s->window[s->bsize_analysis],
(OPUS_BLOCK_SIZE(s->bsize_analysis) << 1));
- s->mdct[s->bsize_analysis]->mdct(s->mdct[s->bsize_analysis], st->coeffs[ch], s->scratch, 1);
+ s->mdct_fn[s->bsize_analysis](s->mdct[s->bsize_analysis], st->coeffs[ch],
+ s->scratch, sizeof(float));
for (i = 0; i < CELT_MAX_BANDS; i++)
st->bands[ch][i] = &st->coeffs[ch][ff_celt_freq_bands[i] << s->bsize_analysis];
@@ -558,13 +558,16 @@ av_cold int ff_opus_psy_init(OpusPsyContext *s, AVCodecContext *avctx,
for (i = 0; i < CELT_BLOCK_NB; i++) {
float tmp;
const int len = OPUS_BLOCK_SIZE(i);
+ const float scale = 68 << (CELT_BLOCK_NB - 1 - i);
s->window[i] = av_malloc(2*len*sizeof(float));
if (!s->window[i]) {
ret = AVERROR(ENOMEM);
goto fail;
}
generate_window_func(s->window[i], 2*len, WFUNC_SINE, &tmp);
- if ((ret = ff_mdct15_init(&s->mdct[i], 0, i + 3, 68 << (CELT_BLOCK_NB - 1 - i))))
+ ret = av_tx_init(&s->mdct[i], &s->mdct_fn[i], AV_TX_FLOAT_MDCT,
+ 0, 15 << (i + 3), &scale, 0);
+ if (ret < 0)
goto fail;
}
@@ -575,7 +578,7 @@ fail:
av_freep(&s->dsp);
for (i = 0; i < CELT_BLOCK_NB; i++) {
- ff_mdct15_uninit(&s->mdct[i]);
+ av_tx_uninit(&s->mdct[i]);
av_freep(&s->window[i]);
}
@@ -598,7 +601,7 @@ av_cold int ff_opus_psy_end(OpusPsyContext *s)
av_freep(&s->dsp);
for (i = 0; i < CELT_BLOCK_NB; i++) {
- ff_mdct15_uninit(&s->mdct[i]);
+ av_tx_uninit(&s->mdct[i]);
av_freep(&s->window[i]);
}
diff --git a/libavcodec/opusenc_psy.h b/libavcodec/opusenc_psy.h
index d4fb096a3d..0a7cdb6f2c 100644
--- a/libavcodec/opusenc_psy.h
+++ b/libavcodec/opusenc_psy.h
@@ -22,6 +22,7 @@
#ifndef AVCODEC_OPUSENC_PSY_H
#define AVCODEC_OPUSENC_PSY_H
+#include "libavutil/tx.h"
#include "libavutil/mem_internal.h"
#include "opusenc.h"
@@ -70,7 +71,8 @@ typedef struct OpusPsyContext {
int max_steps;
float *window[CELT_BLOCK_NB];
- MDCT15Context *mdct[CELT_BLOCK_NB];
+ AVTXContext *mdct[CELT_BLOCK_NB];
+ av_tx_fn mdct_fn[CELT_BLOCK_NB];
int bsize_analysis;
DECLARE_ALIGNED(32, float, scratch)[2048];
--
2.37.2.609.g9ff673ca1a
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* [FFmpeg-devel] [PATCH 2/6] atrac9dec: switch to lavu/tx
[not found] ` <NCgcUxK--3-2@lynne.ee-NCgcZNj----2>
@ 2022-09-23 23:15 ` Lynne
[not found] ` <NCgciJh--3-2@lynne.ee-NCgclLI----2>
1 sibling, 0 replies; 19+ messages in thread
From: Lynne @ 2022-09-23 23:15 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 17 bytes --]
Patch attached.
[-- Attachment #2: 0002-atrac9dec-switch-to-lavu-tx.patch --]
[-- Type: text/x-diff, Size: 3040 bytes --]
From 5a310246569e19efd50b37016a80fe6171df0329 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Sat, 24 Sep 2022 00:51:18 +0200
Subject: [PATCH 2/6] atrac9dec: switch to lavu/tx
---
libavcodec/atrac9dec.c | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/libavcodec/atrac9dec.c b/libavcodec/atrac9dec.c
index d3a5d05799..60962b1676 100644
--- a/libavcodec/atrac9dec.c
+++ b/libavcodec/atrac9dec.c
@@ -25,8 +25,8 @@
#include "codec_internal.h"
#include "decode.h"
#include "get_bits.h"
-#include "fft.h"
#include "atrac9tab.h"
+#include "libavutil/tx.h"
#include "libavutil/lfg.h"
#include "libavutil/float_dsp.h"
#include "libavutil/mem_internal.h"
@@ -86,7 +86,8 @@ typedef struct ATRAC9BlockData {
typedef struct ATRAC9Context {
AVCodecContext *avctx;
AVFloatDSPContext *fdsp;
- FFTContext imdct;
+ AVTXContext *tx;
+ av_tx_fn tx_fn;
ATRAC9BlockData block[5];
AVLFG lfg;
@@ -101,7 +102,7 @@ typedef struct ATRAC9Context {
uint8_t alloc_curve[48][48];
DECLARE_ALIGNED(32, float, imdct_win)[256];
- DECLARE_ALIGNED(32, float, temp)[256];
+ DECLARE_ALIGNED(32, float, temp)[2048];
} ATRAC9Context;
static VLC sf_vlc[2][8]; /* Signed/unsigned, length */
@@ -778,7 +779,7 @@ imdct:
const ptrdiff_t offset = wsize*frame_idx*sizeof(float);
float *dst = (float *)(frame->extended_data[dst_idx] + offset);
- s->imdct.imdct_half(&s->imdct, s->temp, c->coeffs);
+ s->tx_fn(s->tx, s->temp, c->coeffs, sizeof(float));
s->fdsp->vector_fmul_window(dst, c->prev_win, s->temp,
s->imdct_win, wsize >> 1);
memcpy(c->prev_win, s->temp + (wsize >> 1), sizeof(float)*wsize >> 1);
@@ -834,7 +835,7 @@ static av_cold int atrac9_decode_close(AVCodecContext *avctx)
{
ATRAC9Context *s = avctx->priv_data;
- ff_mdct_end(&s->imdct);
+ av_tx_uninit(&s->tx);
av_freep(&s->fdsp);
return 0;
@@ -896,10 +897,11 @@ static av_cold void atrac9_init_static(void)
static av_cold int atrac9_decode_init(AVCodecContext *avctx)
{
+ float scale;
static AVOnce static_table_init = AV_ONCE_INIT;
GetBitContext gb;
ATRAC9Context *s = avctx->priv_data;
- int version, block_config_idx, superframe_idx, alloc_c_len;
+ int err, version, block_config_idx, superframe_idx, alloc_c_len;
s->avctx = avctx;
@@ -959,8 +961,11 @@ static av_cold int atrac9_decode_init(AVCodecContext *avctx)
s->frame_count = 1 << superframe_idx;
s->frame_log2 = at9_tab_sri_frame_log2[s->samplerate_idx];
- if (ff_mdct_init(&s->imdct, s->frame_log2 + 1, 1, 1.0f / 32768.0f))
- return AVERROR(ENOMEM);
+ scale = 1.0f / 32768.0;
+ err = av_tx_init(&s->tx, &s->tx_fn, AV_TX_FLOAT_MDCT, 1,
+ 1 << s->frame_log2, &scale, 0);
+ if (err < 0)
+ return err;
s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
if (!s->fdsp)
--
2.37.2.609.g9ff673ca1a
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* [FFmpeg-devel] [PATCH 3/6] ac3: convert encoder and decoder to lavu/tx
[not found] ` <NCgciJh--3-2@lynne.ee-NCgclLI----2>
@ 2022-09-23 23:18 ` Lynne
[not found] ` <NCgdFqI--B-2@lynne.ee-NCgdIwE----2>
1 sibling, 0 replies; 19+ messages in thread
From: Lynne @ 2022-09-23 23:18 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 89 bytes --]
The fixed-point transforms are much better, faster, and more accurate.
Patch attached.
[-- Attachment #2: 0003-ac3-convert-encoder-and-decoder-to-lavu-tx.patch --]
[-- Type: text/x-diff, Size: 11123 bytes --]
From e54775cc93ceb27d9faabe1ddf9b1eacb269826b Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Sat, 24 Sep 2022 01:05:19 +0200
Subject: [PATCH 3/6] ac3: convert encoder and decoder to lavu/tx
The fixed-point transforms are much better, faster, and more accurate.
---
libavcodec/ac3dec.c | 22 +++++++++++++---------
libavcodec/ac3dec.h | 6 +++---
libavcodec/ac3dec_fixed.c | 3 ++-
libavcodec/ac3dec_float.c | 1 +
libavcodec/ac3enc.c | 2 +-
libavcodec/ac3enc.h | 7 ++++---
libavcodec/ac3enc_fixed.c | 19 ++++---------------
libavcodec/ac3enc_float.c | 18 +++---------------
libavcodec/ac3enc_template.c | 4 ++--
9 files changed, 33 insertions(+), 49 deletions(-)
diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c
index aba8e0fb7f..cd3320caa0 100644
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -217,13 +217,17 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
{
static AVOnce init_static_once = AV_ONCE_INIT;
AC3DecodeContext *s = avctx->priv_data;
+ const float scale = 1.0f;
int i, ret;
s->avctx = avctx;
- if ((ret = ff_mdct_init(&s->imdct_256, 8, 1, 1.0)) < 0 ||
- (ret = ff_mdct_init(&s->imdct_512, 9, 1, 1.0)) < 0)
+ if ((ret = av_tx_init(&s->tx_128, &s->tx_fn_128, IMDCT_TYPE, 1, 128, &scale, 0)))
return ret;
+
+ if ((ret = av_tx_init(&s->tx_256, &s->tx_fn_256, IMDCT_TYPE, 1, 256, &scale, 0)))
+ return ret;
+
AC3_RENAME(ff_kbd_window_init)(s->window, 5.0, 256);
ff_bswapdsp_init(&s->bdsp);
@@ -721,10 +725,10 @@ static inline void do_imdct(AC3DecodeContext *s, int channels, int offset)
for (ch = 1; ch <= channels; ch++) {
if (s->block_switch[ch]) {
int i;
- FFTSample *x = s->tmp_output + 128;
+ INTFLOAT *x = s->tmp_output + 128;
for (i = 0; i < 128; i++)
x[i] = s->transform_coeffs[ch][2 * i];
- s->imdct_256.imdct_half(&s->imdct_256, s->tmp_output, x);
+ s->tx_fn_128(s->tx_128, s->tmp_output, x, sizeof(INTFLOAT));
#if USE_FIXED
s->fdsp->vector_fmul_window_scaled(s->outptr[ch - 1], s->delay[ch - 1 + offset],
s->tmp_output, s->window, 128, 8);
@@ -734,9 +738,9 @@ static inline void do_imdct(AC3DecodeContext *s, int channels, int offset)
#endif
for (i = 0; i < 128; i++)
x[i] = s->transform_coeffs[ch][2 * i + 1];
- s->imdct_256.imdct_half(&s->imdct_256, s->delay[ch - 1 + offset], x);
+ s->tx_fn_256(s->tx_256, s->delay[ch - 1 + offset], x, sizeof(INTFLOAT));
} else {
- s->imdct_512.imdct_half(&s->imdct_512, s->tmp_output, s->transform_coeffs[ch]);
+ s->tx_fn_256(s->tx_256, s->tmp_output, s->transform_coeffs[ch], sizeof(INTFLOAT));
#if USE_FIXED
s->fdsp->vector_fmul_window_scaled(s->outptr[ch - 1], s->delay[ch - 1 + offset],
s->tmp_output, s->window, 128, 8);
@@ -744,7 +748,7 @@ static inline void do_imdct(AC3DecodeContext *s, int channels, int offset)
s->fdsp->vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1 + offset],
s->tmp_output, s->window, 128);
#endif
- memcpy(s->delay[ch - 1 + offset], s->tmp_output + 128, 128 * sizeof(FFTSample));
+ memcpy(s->delay[ch - 1 + offset], s->tmp_output + 128, 128 * sizeof(INTFLOAT));
}
}
}
@@ -1865,8 +1869,8 @@ skip:
static av_cold int ac3_decode_end(AVCodecContext *avctx)
{
AC3DecodeContext *s = avctx->priv_data;
- ff_mdct_end(&s->imdct_512);
- ff_mdct_end(&s->imdct_256);
+ av_tx_uninit(&s->tx_256);
+ av_tx_uninit(&s->tx_128);
av_freep(&s->fdsp);
av_freep(&s->downmix_coeffs[0]);
diff --git a/libavcodec/ac3dec.h b/libavcodec/ac3dec.h
index 88651ae61f..138b462abb 100644
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -50,6 +50,7 @@
#ifndef AVCODEC_AC3DEC_H
#define AVCODEC_AC3DEC_H
+#include "libavutil/tx.h"
#include "libavutil/float_dsp.h"
#include "libavutil/fixed_dsp.h"
#include "libavutil/lfg.h"
@@ -60,7 +61,6 @@
#include "avcodec.h"
#include "bswapdsp.h"
#include "get_bits.h"
-#include "fft.h"
#include "fmtconvert.h"
#define AC3_OUTPUT_LFEON 8
@@ -223,8 +223,8 @@ typedef struct AC3DecodeContext {
///@name IMDCT
int block_switch[AC3_MAX_CHANNELS]; ///< block switch flags (blksw)
- FFTContext imdct_512; ///< for 512 sample IMDCT
- FFTContext imdct_256; ///< for 256 sample IMDCT
+ AVTXContext *tx_128, *tx_256;
+ av_tx_fn tx_fn_128, tx_fn_256;
///@}
///@name Optimization
diff --git a/libavcodec/ac3dec_fixed.c b/libavcodec/ac3dec_fixed.c
index 0a7ae6cfbf..c9e5cda69c 100644
--- a/libavcodec/ac3dec_fixed.c
+++ b/libavcodec/ac3dec_fixed.c
@@ -47,11 +47,12 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#define FFT_FLOAT 0
#define USE_FIXED 1
#include "ac3dec.h"
#include "codec_internal.h"
+#define IMDCT_TYPE AV_TX_INT32_MDCT
+#include "ac3dec.h"
static const int end_freq_inv_tab[8] =
{
diff --git a/libavcodec/ac3dec_float.c b/libavcodec/ac3dec_float.c
index 8c1adb3e01..b8868d8ee1 100644
--- a/libavcodec/ac3dec_float.c
+++ b/libavcodec/ac3dec_float.c
@@ -29,6 +29,7 @@
*/
#include "config_components.h"
+#define IMDCT_TYPE AV_TX_FLOAT_MDCT
#include "ac3dec.h"
#include "codec_internal.h"
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index a090576823..fbedf40d20 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -2203,7 +2203,7 @@ av_cold int ff_ac3_encode_close(AVCodecContext *avctx)
av_freep(&block->cpl_coord_mant);
}
- s->mdct_end(s);
+ av_tx_uninit(&s->tx);
return 0;
}
diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h
index f0dc006759..55e88d69e4 100644
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -31,12 +31,13 @@
#include <stdint.h>
#include "libavutil/opt.h"
+#include "libavutil/tx.h"
+
#include "ac3.h"
#include "ac3defs.h"
#include "ac3dsp.h"
#include "avcodec.h"
#include "codec_internal.h"
-#include "fft.h"
#include "mathops.h"
#include "me_cmp.h"
#include "put_bits.h"
@@ -167,7 +168,8 @@ typedef struct AC3EncodeContext {
#endif
MECmpContext mecc;
AC3DSPContext ac3dsp; ///< AC-3 optimized functions
- FFTContext mdct; ///< FFT context for MDCT calculation
+ AVTXContext *tx; ///< AVTX context for MDCT calculation
+ av_tx_fn tx_fn;
const SampleType *mdct_window; ///< MDCT window function array
AC3Block blocks[AC3_MAX_BLOCKS]; ///< per-block info
@@ -257,7 +259,6 @@ typedef struct AC3EncodeContext {
int warned_alternate_bitstream;
/* fixed vs. float function pointers */
- void (*mdct_end)(struct AC3EncodeContext *s);
int (*mdct_init)(struct AC3EncodeContext *s);
/* fixed vs. float templated function pointers */
diff --git a/libavcodec/ac3enc_fixed.c b/libavcodec/ac3enc_fixed.c
index a22d3b4abf..76e5392733 100644
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -27,7 +27,7 @@
*/
#define AC3ENC_FLOAT 0
-#define FFT_FLOAT 0
+#include "internal.h"
#include "audiodsp.h"
#include "ac3enc.h"
#include "codec_internal.h"
@@ -66,20 +66,8 @@ static CoefType calc_cpl_coord(CoefSumType energy_ch, CoefSumType energy_cpl)
}
}
-
#include "ac3enc_template.c"
-
-/**
- * Finalize MDCT and free allocated memory.
- *
- * @param s AC-3 encoder private context
- */
-static av_cold void ac3_fixed_mdct_end(AC3EncodeContext *s)
-{
- ff_mdct_end(&s->mdct);
-}
-
/**
* Initialize MDCT tables.
*
@@ -89,6 +77,7 @@ static av_cold void ac3_fixed_mdct_end(AC3EncodeContext *s)
static av_cold int ac3_fixed_mdct_init(AC3EncodeContext *s)
{
float fwin[AC3_BLOCK_SIZE];
+ const float scale = -1.0f;
int32_t *iwin = av_malloc_array(AC3_BLOCK_SIZE, sizeof(*iwin));
if (!iwin)
@@ -104,7 +93,8 @@ static av_cold int ac3_fixed_mdct_init(AC3EncodeContext *s)
if (!s->fdsp)
return AVERROR(ENOMEM);
- return ff_mdct_init(&s->mdct, 9, 0, -1.0);
+ return av_tx_init(&s->tx, &s->tx_fn, AV_TX_INT32_MDCT, 0,
+ AC3_BLOCK_SIZE, &scale, 0);
}
@@ -112,7 +102,6 @@ static av_cold int ac3_fixed_encode_init(AVCodecContext *avctx)
{
AC3EncodeContext *s = avctx->priv_data;
s->fixed_point = 1;
- s->mdct_end = ac3_fixed_mdct_end;
s->mdct_init = ac3_fixed_mdct_init;
s->allocate_sample_buffers = allocate_sample_buffers;
return ff_ac3_encode_init(avctx);
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index 6238980690..8a3f605b48 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -75,21 +75,8 @@ static void sum_square_butterfly(AC3EncodeContext *s, float sum[4],
s->ac3dsp.sum_square_butterfly_float(sum, coef0, coef1, len);
}
-
#include "ac3enc_template.c"
-
-/**
- * Finalize MDCT and free allocated memory.
- *
- * @param s AC-3 encoder private context
- */
-static av_cold void ac3_float_mdct_end(AC3EncodeContext *s)
-{
- ff_mdct_end(&s->mdct);
-}
-
-
/**
* Initialize MDCT tables.
*
@@ -98,6 +85,7 @@ static av_cold void ac3_float_mdct_end(AC3EncodeContext *s)
*/
static av_cold int ac3_float_mdct_init(AC3EncodeContext *s)
{
+ const float scale = -2.0 / AC3_WINDOW_SIZE;
float *window = av_malloc_array(AC3_BLOCK_SIZE, sizeof(*window));
if (!window) {
av_log(s->avctx, AV_LOG_ERROR, "Cannot allocate memory.\n");
@@ -107,14 +95,14 @@ static av_cold int ac3_float_mdct_init(AC3EncodeContext *s)
ff_kbd_window_init(window, 5.0, AC3_BLOCK_SIZE);
s->mdct_window = window;
- return ff_mdct_init(&s->mdct, 9, 0, -2.0 / AC3_WINDOW_SIZE);
+ return av_tx_init(&s->tx, &s->tx_fn, AV_TX_FLOAT_MDCT, 0,
+ AC3_BLOCK_SIZE, &scale, 0);
}
av_cold int ff_ac3_float_encode_init(AVCodecContext *avctx)
{
AC3EncodeContext *s = avctx->priv_data;
- s->mdct_end = ac3_float_mdct_end;
s->mdct_init = ac3_float_mdct_init;
s->allocate_sample_buffers = allocate_sample_buffers;
s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c
index de66964d0d..be4ecebc9c 100644
--- a/libavcodec/ac3enc_template.c
+++ b/libavcodec/ac3enc_template.c
@@ -98,8 +98,8 @@ static void apply_mdct(AC3EncodeContext *s)
&input_samples[AC3_BLOCK_SIZE],
s->mdct_window, AC3_BLOCK_SIZE);
- s->mdct.mdct_calc(&s->mdct, block->mdct_coef[ch+1],
- s->windowed_samples);
+ s->tx_fn(s->tx, block->mdct_coef[ch+1],
+ s->windowed_samples, sizeof(float));
}
}
}
--
2.37.2.609.g9ff673ca1a
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* [FFmpeg-devel] [PATCH 4/6] vorbisdec: convert to lavu/tx
[not found] ` <NCgdFqI--B-2@lynne.ee-NCgdIwE----2>
@ 2022-09-23 23:18 ` Lynne
[not found] ` <NCgdOA8--3-2@lynne.ee-NCgdR4N----2>
1 sibling, 0 replies; 19+ messages in thread
From: Lynne @ 2022-09-23 23:18 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 17 bytes --]
Patch attached.
[-- Attachment #2: 0004-vorbisdec-convert-to-lavu-tx.patch --]
[-- Type: text/x-diff, Size: 3334 bytes --]
From 1334c8c26a8d1c3f8e2aa98b902b2dab6e524a84 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Sat, 24 Sep 2022 01:07:15 +0200
Subject: [PATCH 4/6] vorbisdec: convert to lavu/tx
---
libavcodec/vorbisdec.c | 29 ++++++++++++++++++++---------
1 file changed, 20 insertions(+), 9 deletions(-)
diff --git a/libavcodec/vorbisdec.c b/libavcodec/vorbisdec.c
index 0d04e7c2c4..44c76d0da2 100644
--- a/libavcodec/vorbisdec.c
+++ b/libavcodec/vorbisdec.c
@@ -29,6 +29,7 @@
#include <inttypes.h>
#include <math.h>
+#include "libavutil/tx.h"
#include "libavutil/avassert.h"
#include "libavutil/float_dsp.h"
@@ -36,7 +37,6 @@
#include "avcodec.h"
#include "codec_internal.h"
#include "decode.h"
-#include "fft.h"
#include "get_bits.h"
#include "vorbis.h"
#include "vorbisdsp.h"
@@ -129,7 +129,8 @@ typedef struct vorbis_context_s {
VorbisDSPContext dsp;
AVFloatDSPContext *fdsp;
- FFTContext mdct[2];
+ AVTXContext *tx[2];
+ av_tx_fn tx_fn[2];
uint8_t first_frame;
int64_t initial_pts;
uint32_t version;
@@ -201,8 +202,8 @@ static void vorbis_free(vorbis_context *vc)
av_freep(&vc->residues);
av_freep(&vc->modes);
- ff_mdct_end(&vc->mdct[0]);
- ff_mdct_end(&vc->mdct[1]);
+ av_tx_uninit(&vc->tx[0]);
+ av_tx_uninit(&vc->tx[1]);
if (vc->codebooks)
for (i = 0; i < vc->codebook_count; ++i) {
@@ -961,6 +962,8 @@ static int vorbis_parse_setup_hdr(vorbis_context *vc)
static int vorbis_parse_id_hdr(vorbis_context *vc)
{
+ int ret;
+ const float mdct_scale = -1.0f;
GetBitContext *gb = &vc->gb;
unsigned bl0, bl1;
@@ -1008,8 +1011,14 @@ static int vorbis_parse_id_hdr(vorbis_context *vc)
vc->previous_window = -1;
- ff_mdct_init(&vc->mdct[0], bl0, 1, -1.0);
- ff_mdct_init(&vc->mdct[1], bl1, 1, -1.0);
+ if ((ret = av_tx_init(&vc->tx[0], &vc->tx_fn[0], AV_TX_FLOAT_MDCT, 1, 1 << (bl0 - 1),
+ &mdct_scale, 0)))
+ return ret;
+
+ if ((ret = av_tx_init(&vc->tx[1], &vc->tx_fn[1], AV_TX_FLOAT_MDCT, 1, 1 << (bl1 - 1),
+ &mdct_scale, 0)))
+ return ret;
+
vc->fdsp = avpriv_float_dsp_alloc(vc->avctx->flags & AV_CODEC_FLAG_BITEXACT);
if (!vc->fdsp)
return AVERROR(ENOMEM);
@@ -1584,7 +1593,8 @@ static inline int vorbis_residue_decode(vorbis_context *vc, vorbis_residue *vr,
static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
{
GetBitContext *gb = &vc->gb;
- FFTContext *mdct;
+ AVTXContext *tx;
+ av_tx_fn tx_fn;
int previous_window = vc->previous_window;
unsigned mode_number, blockflag, blocksize;
int i, j;
@@ -1706,12 +1716,13 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
// Dotproduct, MDCT
- mdct = &vc->mdct[blockflag];
+ tx = vc->tx[blockflag];
+ tx_fn = vc->tx_fn[blockflag];
for (j = vc->audio_channels-1;j >= 0; j--) {
ch_res_ptr = vc->channel_residues + res_chan[j] * blocksize / 2;
vc->fdsp->vector_fmul(floor_ptr[j], floor_ptr[j], ch_res_ptr, blocksize / 2);
- mdct->imdct_half(mdct, ch_res_ptr, floor_ptr[j]);
+ tx_fn(tx, ch_res_ptr, floor_ptr[j], sizeof(float));
}
// Overlap/add, save data for next overlapping
--
2.37.2.609.g9ff673ca1a
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* [FFmpeg-devel] [PATCH 5/6] twinvq: convert to lavu/tx
[not found] ` <NCgdOA8--3-2@lynne.ee-NCgdR4N----2>
@ 2022-09-23 23:19 ` Lynne
[not found] ` <NCgdYSD--3-2@lynne.ee-NCgdaK4----2>
1 sibling, 0 replies; 19+ messages in thread
From: Lynne @ 2022-09-23 23:19 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 17 bytes --]
Patch attached.
[-- Attachment #2: 0005-twinvq-convert-to-lavu-tx.patch --]
[-- Type: text/x-diff, Size: 2877 bytes --]
From 685ac65ce0f391fd1d3a06e191c9659dacd375be Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Sat, 24 Sep 2022 01:07:44 +0200
Subject: [PATCH 5/6] twinvq: convert to lavu/tx
---
libavcodec/twinvq.c | 12 +++++++-----
libavcodec/twinvq.h | 6 ++++--
2 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/libavcodec/twinvq.c b/libavcodec/twinvq.c
index da10923d78..8cd3c91e14 100644
--- a/libavcodec/twinvq.c
+++ b/libavcodec/twinvq.c
@@ -328,7 +328,8 @@ static const uint8_t wtype_to_wsize[] = { 0, 0, 2, 2, 2, 1, 0, 1, 1 };
static void imdct_and_window(TwinVQContext *tctx, enum TwinVQFrameType ftype,
int wtype, float *in, float *prev, int ch)
{
- FFTContext *mdct = &tctx->mdct_ctx[ftype];
+ AVTXContext *tx = tctx->tx[ftype];
+ av_tx_fn tx_fn = tctx->tx_fn[ftype];
const TwinVQModeTab *mtab = tctx->mtab;
int bsize = mtab->size / mtab->fmode[ftype].sub;
int size = mtab->size;
@@ -357,7 +358,7 @@ static void imdct_and_window(TwinVQContext *tctx, enum TwinVQFrameType ftype,
wsize = types_sizes[wtype_to_wsize[sub_wtype]];
- mdct->imdct_half(mdct, buf1 + bsize * j, in + bsize * j);
+ tx_fn(tx, buf1 + bsize * j, in + bsize * j, sizeof(float));
tctx->fdsp->vector_fmul_window(out2, prev_buf + (bsize - wsize) / 2,
buf1 + bsize * j,
@@ -543,8 +544,9 @@ static av_cold int init_mdct_win(TwinVQContext *tctx)
for (i = 0; i < 3; i++) {
int bsize = tctx->mtab->size / tctx->mtab->fmode[i].sub;
- if ((ret = ff_mdct_init(&tctx->mdct_ctx[i], av_log2(bsize) + 1, 1,
- -sqrt(norm / bsize) / (1 << 15))))
+ const float scale = -sqrt(norm / bsize) / (1 << 15);
+ if ((ret = av_tx_init(&tctx->tx[i], &tctx->tx_fn[i], AV_TX_FLOAT_MDCT,
+ 1, bsize, &scale, 0)))
return ret;
}
@@ -745,7 +747,7 @@ av_cold int ff_twinvq_decode_close(AVCodecContext *avctx)
int i;
for (i = 0; i < 3; i++) {
- ff_mdct_end(&tctx->mdct_ctx[i]);
+ av_tx_uninit(&tctx->tx[i]);
av_freep(&tctx->cos_tabs[i]);
}
diff --git a/libavcodec/twinvq.h b/libavcodec/twinvq.h
index b3c881cfac..72b9ba8198 100644
--- a/libavcodec/twinvq.h
+++ b/libavcodec/twinvq.h
@@ -25,10 +25,11 @@
#include <math.h>
#include <stdint.h>
+#include "libavutil/tx.h"
#include "libavutil/common.h"
#include "libavutil/float_dsp.h"
#include "avcodec.h"
-#include "fft.h"
+#include "internal.h"
enum TwinVQCodec {
TWINVQ_CODEC_VQF,
@@ -136,7 +137,8 @@ typedef struct TwinVQModeTab {
typedef struct TwinVQContext {
AVCodecContext *avctx;
AVFloatDSPContext *fdsp;
- FFTContext mdct_ctx[3];
+ AVTXContext *tx[3];
+ av_tx_fn tx_fn[3];
const TwinVQModeTab *mtab;
--
2.37.2.609.g9ff673ca1a
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* [FFmpeg-devel] [PATCH 6/6] wmaprodec: convert to lavu/tx
[not found] ` <NCgdYSD--3-2@lynne.ee-NCgdaK4----2>
@ 2022-09-23 23:20 ` Lynne
2022-09-25 12:38 ` Andreas Rheinhardt
0 siblings, 1 reply; 19+ messages in thread
From: Lynne @ 2022-09-23 23:20 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 17 bytes --]
Patch attached.
[-- Attachment #2: 0006-wmaprodec-convert-to-lavu-tx.patch --]
[-- Type: text/x-diff, Size: 3427 bytes --]
From 4ad73f29065051c68991eb96aeae7f771039209a Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Sat, 24 Sep 2022 01:08:00 +0200
Subject: [PATCH 6/6] wmaprodec: convert to lavu/tx
---
libavcodec/wmaprodec.c | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/libavcodec/wmaprodec.c b/libavcodec/wmaprodec.c
index 701dfa955c..68e17e0743 100644
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -89,6 +89,7 @@
#include <inttypes.h>
#include "libavutil/audio_fifo.h"
+#include "libavutil/tx.h"
#include "libavutil/ffmath.h"
#include "libavutil/float_dsp.h"
#include "libavutil/intfloat.h"
@@ -185,7 +186,8 @@ typedef struct WMAProDecodeCtx {
uint8_t frame_data[MAX_FRAMESIZE +
AV_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
PutBitContext pb; ///< context for filling the frame_data buffer
- FFTContext mdct_ctx[WMAPRO_BLOCK_SIZES]; ///< MDCT context per block size
+ AVTXContext *tx[WMAPRO_BLOCK_SIZES]; ///< MDCT context per block size
+ av_tx_fn tx_fn[WMAPRO_BLOCK_SIZES];
DECLARE_ALIGNED(32, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
const float* windows[WMAPRO_BLOCK_SIZES]; ///< windows for the different block sizes
@@ -287,7 +289,7 @@ static av_cold int decode_end(WMAProDecodeCtx *s)
av_freep(&s->fdsp);
for (i = 0; i < WMAPRO_BLOCK_SIZES; i++)
- ff_mdct_end(&s->mdct_ctx[i]);
+ av_tx_uninit(&s->tx[i]);
return 0;
}
@@ -552,12 +554,13 @@ static av_cold int decode_init(WMAProDecodeCtx *s, AVCodecContext *avctx, int nu
return AVERROR(ENOMEM);
/** init MDCT, FIXME: only init needed sizes */
- for (int i = 0; i < WMAPRO_BLOCK_SIZES; i++) {
- ret = ff_mdct_init(&s->mdct_ctx[i], WMAPRO_BLOCK_MIN_BITS + 1 + i, 1,
- 1.0 / (1 << (WMAPRO_BLOCK_MIN_BITS + i - 1))
- / (1ll << (s->bits_per_sample - 1)));
- if (ret < 0)
- return ret;
+ for (i = 0; i < WMAPRO_BLOCK_SIZES; i++) {
+ const float scale = 1.0 / (1 << (WMAPRO_BLOCK_MIN_BITS + i - 1))
+ / (1ll << (s->bits_per_sample - 1));
+ int err = av_tx_init(&s->tx[i], &s->tx_fn[i], AV_TX_FLOAT_MDCT, 1,
+ 1 << (WMAPRO_BLOCK_MIN_BITS + i), &scale, 0);
+ if (err < 0)
+ return err;
}
/** init MDCT windows: simple sine window */
@@ -1386,7 +1389,8 @@ static int decode_subframe(WMAProDecodeCtx *s)
get_bits_count(&s->gb) - s->subframe_offset);
if (transmit_coeffs) {
- FFTContext *mdct = &s->mdct_ctx[av_log2(subframe_len) - WMAPRO_BLOCK_MIN_BITS];
+ AVTXContext *tx = s->tx[av_log2(subframe_len) - WMAPRO_BLOCK_MIN_BITS];
+ av_tx_fn tx_fn = s->tx_fn[av_log2(subframe_len) - WMAPRO_BLOCK_MIN_BITS];
/** reconstruct the per channel data */
inverse_channel_transform(s);
for (i = 0; i < s->channels_for_cur_subframe; i++) {
@@ -1412,7 +1416,7 @@ static int decode_subframe(WMAProDecodeCtx *s)
}
/** apply imdct (imdct_half == DCTIV with reverse) */
- mdct->imdct_half(mdct, s->channel[c].coeffs, s->tmp);
+ tx_fn(tx, s->channel[c].coeffs, s->tmp, sizeof(float));
}
}
--
2.37.2.609.g9ff673ca1a
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
2022-09-23 23:14 [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx Lynne
[not found] ` <NCgcUxK--3-2@lynne.ee-NCgcZNj----2>
@ 2022-09-24 18:42 ` Martin Storsjö
2022-09-24 19:26 ` Hendrik Leppkes
1 sibling, 1 reply; 19+ messages in thread
From: Martin Storsjö @ 2022-09-24 18:42 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Sat, 24 Sep 2022, Lynne wrote:
> This commit changes both the encoder and decoder to use the new lavu/tx code,
> which has faster C transforms and more assembly optimizations.
What's the case of e.g. 32 bit arm - that does have a bunch of fft and
mdct assembly, but is that something that ends up used by opus today, or
does the mdct15 stuff use separate codepaths that aren't optimized there
today yet?
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
2022-09-24 18:42 ` [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder " Martin Storsjö
@ 2022-09-24 19:26 ` Hendrik Leppkes
2022-09-24 19:31 ` Hendrik Leppkes
0 siblings, 1 reply; 19+ messages in thread
From: Hendrik Leppkes @ 2022-09-24 19:26 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Sat, Sep 24, 2022 at 8:43 PM Martin Storsjö <martin@martin.st> wrote:
>
> On Sat, 24 Sep 2022, Lynne wrote:
>
> > This commit changes both the encoder and decoder to use the new lavu/tx code,
> > which has faster C transforms and more assembly optimizations.
>
> What's the case of e.g. 32 bit arm - that does have a bunch of fft and
> mdct assembly, but is that something that ends up used by opus today, or
> does the mdct15 stuff use separate codepaths that aren't optimized there
> today yet?
>
mdct15 only has some x86 assembly, nothing for ARM.
Only the normal (power of 2) fft/mdct has some ARM 32-bit assembly.
- Hendrik
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
2022-09-24 19:26 ` Hendrik Leppkes
@ 2022-09-24 19:31 ` Hendrik Leppkes
2022-09-24 19:40 ` Martin Storsjö
0 siblings, 1 reply; 19+ messages in thread
From: Hendrik Leppkes @ 2022-09-24 19:31 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Sat, Sep 24, 2022 at 9:26 PM Hendrik Leppkes <h.leppkes@gmail.com> wrote:
>
> On Sat, Sep 24, 2022 at 8:43 PM Martin Storsjö <martin@martin.st> wrote:
> >
> > On Sat, 24 Sep 2022, Lynne wrote:
> >
> > > This commit changes both the encoder and decoder to use the new lavu/tx code,
> > > which has faster C transforms and more assembly optimizations.
> >
> > What's the case of e.g. 32 bit arm - that does have a bunch of fft and
> > mdct assembly, but is that something that ends up used by opus today, or
> > does the mdct15 stuff use separate codepaths that aren't optimized there
> > today yet?
> >
>
> mdct15 only has some x86 assembly, nothing for ARM.
> Only the normal (power of 2) fft/mdct has some ARM 32-bit assembly.
>
Actually, I missed that the mdct15 internally uses one of the normal
fft functions for a part of the calculation, but how much impact that
has on performance vs. the new code where the C alone is quite a bit
faster would have to be confirmed by Lynne.
- Hendrik
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
2022-09-24 19:31 ` Hendrik Leppkes
@ 2022-09-24 19:40 ` Martin Storsjö
2022-09-24 21:57 ` Lynne
[not found] ` <NClNyyy--3-2@lynne.ee-NClVNO6----2>
0 siblings, 2 replies; 19+ messages in thread
From: Martin Storsjö @ 2022-09-24 19:40 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Sat, 24 Sep 2022, Hendrik Leppkes wrote:
> On Sat, Sep 24, 2022 at 9:26 PM Hendrik Leppkes <h.leppkes@gmail.com> wrote:
>>
>> On Sat, Sep 24, 2022 at 8:43 PM Martin Storsjö <martin@martin.st> wrote:
>> >
>> > On Sat, 24 Sep 2022, Lynne wrote:
>> >
>> > > This commit changes both the encoder and decoder to use the new lavu/tx code,
>> > > which has faster C transforms and more assembly optimizations.
>> >
>> > What's the case of e.g. 32 bit arm - that does have a bunch of fft and
>> > mdct assembly, but is that something that ends up used by opus today, or
>> > does the mdct15 stuff use separate codepaths that aren't optimized there
>> > today yet?
>> >
>>
>> mdct15 only has some x86 assembly, nothing for ARM.
>> Only the normal (power of 2) fft/mdct has some ARM 32-bit assembly.
>>
>
> Actually, I missed that the mdct15 internally uses one of the normal
> fft functions for a part of the calculation, but how much impact that
> has on performance vs. the new code where the C alone is quite a bit
> faster would have to be confirmed by Lynne.
Ok, fair enough.
What about ac3dsp then - that one seems like it's fairly optimized for
arm?
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
2022-09-24 19:40 ` Martin Storsjö
@ 2022-09-24 21:57 ` Lynne
2022-09-25 19:55 ` Martin Storsjö
[not found] ` <NClNyyy--3-2@lynne.ee-NClVNO6----2>
1 sibling, 1 reply; 19+ messages in thread
From: Lynne @ 2022-09-24 21:57 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Sep 24, 2022, 21:40 by martin@martin.st:
> On Sat, 24 Sep 2022, Hendrik Leppkes wrote:
>
>> On Sat, Sep 24, 2022 at 9:26 PM Hendrik Leppkes <h.leppkes@gmail.com> wrote:
>>
>>>
>>> On Sat, Sep 24, 2022 at 8:43 PM Martin Storsjö <martin@martin.st> wrote:
>>> >
>>> > On Sat, 24 Sep 2022, Lynne wrote:
>>> >
>>> > > This commit changes both the encoder and decoder to use the new lavu/tx code,
>>> > > which has faster C transforms and more assembly optimizations.
>>> >
>>> > What's the case of e.g. 32 bit arm - that does have a bunch of fft and
>>> > mdct assembly, but is that something that ends up used by opus today, or
>>> > does the mdct15 stuff use separate codepaths that aren't optimized there
>>> > today yet?
>>> >
>>>
>>> mdct15 only has some x86 assembly, nothing for ARM.
>>> Only the normal (power of 2) fft/mdct has some ARM 32-bit assembly.
>>>
>>
>> Actually, I missed that the mdct15 internally uses one of the normal
>> fft functions for a part of the calculation, but how much impact that
>> has on performance vs. the new code where the C alone is quite a bit
>> faster would have to be confirmed by Lynne.
>>
>
> Ok, fair enough.
>
I did some benchmarking. Just lavc's C nptwo MDCT is 10% slower than lavu's
C nptwo MDCT. I don't have 32bit ARM hardware to test on, but I do have an
aarch64 A53 core. On it, the performance difference with all optimizations with
this patch on or off was that the decoder became 15% faster. With lavu/tx's aarch64
assembly disabled to simulate arm32's situation, the decoder was still 10% faster
overall. It's probably going to be similar on arm32.
On x86, the decoder with this patch but all lavu/tx asm disabled was only 10% slower
than the decoder without this patch.
With assembly enabled and this patch, the decoder is 15% faster overall on an
Alder Lake system.
As for the overall decoding time consumption for Opus, the MDCT is very far behind
the largest overhead - coefficient decoding (on x86 with optimizations, 50% of the
time is spent there, whilst only 5% on the MDCT in total). It's a very optimized decoder.
In general, for the transform alone, with a C non-power-of-two lavu MDCT at the lengths
used by Opus, using C instead of AVX for the ptwo part makes the 960pt transform on
the order of 20% slower, and using C instead of SSE for 240pt is also around 20%
slower. Most of this is due to the function call overhead, (framesize/2)/ptwo = 120,
60, 30 and 15 calls to ptwo FFTs per transform. The assembly function largely
eliminates this overhead by linking assembly functions together with a minimal
'ABI'.
> What about ac3dsp then - that one seems like it's fairly optimized for arm?
>
Haven't touched them, they're still being used. Unfortunately, for AC3,
the full MDCT optimizations in lavc do make a difference and the overall
decoder becomes 15% slower with this patch on for aarch64 with lavu/tx's
asm disabled and 7% slower with lavu/tx's asm enabled. I do plan to write
aarch64 MDCT NEON SIMD code in a month or so, unless someone is faster,
which should make the decoder at least 10% faster with lavu/tx.
For Opus, the used ptwo lengths are (framesize/2)/15 = 32, 16, 8 and 4pt FFTs.
If you'd like to help out, I've documented the C factorizations used in
docs/transforms.md. You could also try porting the existing assembly. It should be
trivial if they don't use the upper half of the tables. lavc's and lavu's FFT tables
differ by size - lavu's are half the size of lavc's tables, because lavc's tables
contain the multiplication factors mirrored after the halfway point. That's used by
the RDFT, and by the x86 assembly. It's not worth replicating this, the
memory overhead is just too much, especially on bandwidth starved cores.
If the arm32 assembly uses the upper part, it shouldn't be too hard to
make it read from both the start and end point of the exptab array in the
recombination function of ptwo transforms.
The MDCT asm can be ported in a straightforward way and would improve
both decoders significantly. If the ABI is simpler than x86's, you could even
make the asm transform call into C functions, which would lessen the work.
A lot of the MDCT overhead is in the gather and multiplication part, whilst
the FFT is limited by mostly adds and memory bandwidth, so just with
MDCT assembly the decoder would get a lot faster.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
[not found] ` <NClNyyy--3-2@lynne.ee-NClVNO6----2>
@ 2022-09-25 7:54 ` Lynne
2022-09-25 12:34 ` Andreas Rheinhardt
0 siblings, 1 reply; 19+ messages in thread
From: Lynne @ 2022-09-25 7:54 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Sep 24, 2022, 23:57 by dev@lynne.ee:
> Sep 24, 2022, 21:40 by martin@martin.st:
>
>> What about ac3dsp then - that one seems like it's fairly optimized for arm?
>>
> Haven't touched them, they're still being used. Unfortunately, for AC3,
> the full MDCT optimizations in lavc do make a difference and the overall
> decoder becomes 15% slower with this patch on for aarch64 with lavu/tx's
> asm disabled and 7% slower with lavu/tx's asm enabled. I do plan to write
> an aarch64 MDCT NEON SIMD code in a month or so, unless someone is faster,
> which should make the decoder at least 10% faster with lavu/tx.
>
I'd just like to add this was for the float version of the ac3 decoder. The fixed-point
version is a few percent faster with the patch on an A53, and quite a bit
more accurate.
The lavc fixed-point FFT code also has some weird large spikes in #cycles
for some transform sizes, so the figure above is an average, but the dips
went from 117x realtime to 78x realtime, which on a slower CPU may
be the difference between stuttering and realtime playback.
On this CPU, the fixed-point version is 23% slower than the float version,
but on a CPU with slower float ops, it would make more sense to pick that
decoder up than the float version.
The 2 decoders produce nearly identical results, minus a few rounding
errors, since AC3 is inherently a fixed-point codec. The only differences
are the transforms themselves, and the extra ops needed to convert
the 25bit ints to floats in the float decoder.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
2022-09-25 7:54 ` Lynne
@ 2022-09-25 12:34 ` Andreas Rheinhardt
2022-09-25 21:08 ` Lynne
0 siblings, 1 reply; 19+ messages in thread
From: Andreas Rheinhardt @ 2022-09-25 12:34 UTC (permalink / raw)
To: ffmpeg-devel
Lynne:
> Sep 24, 2022, 23:57 by dev@lynne.ee:
>
>> Sep 24, 2022, 21:40 by martin@martin.st:
>>
>>> What about ac3dsp then - that one seems like it's fairly optimized for arm?
>>>
>> Haven't touched them, they're still being used. Unfortunately, for AC3,
>> the full MDCT optimizations in lavc do make a difference and the overall
>> decoder becomes 15% slower with this patch on for aarch64 with lavu/tx's
>> asm disabled and 7% slower with lavu/tx's asm enabled. I do plan to write
>> an aarch64 MDCT NEON SIMD code in a month or so, unless someone is faster,
>> which should make the decoder at least 10% faster with lavu/tx.
>>
>
> I'd just like to add this was for the float version of the ac3 decoder. The fixed-point
> version is a few percent faster with the patch on an A53, and quite a bit
> more accurate.
> The lavc fixed-point FFT code also has some weird large spikes in #cycles
> for some transform sizes, so the figure above is an average, but the dips
> went from 117x realtime to 78x realtime, which on a slower CPU may
> be the difference between stuttering and realtime playback.
> On this CPU, the fixed-point version is 23% slower than the float version,
> but on a CPU with slower float ops, it would make more sense to pick that
> decoder up than the float version.
> The 2 decoders produce nearly identical results, minus a few rounding
> errors, since AC3 is inherently a fixed-point codec. The only difference
> are the transforms themselves, and the extra ops needed to convert
> the 25bit ints to floats in the float decoder.
1. You forgot to remove mdct15 requirements from configure in this whole
patchset.
2. You forgot to update the FATE references for several tests; e.g. when
only applying the ac3 patch, then I get this:
TEST ac3-4.0
stddev: 7.60 PSNR: 78.71 MAXDIFF: 867 bytes: 761856/ 761856
MAXDIFF: |867 - 0| >= 1
Test ac3-4.0 failed. Look at tests/data/fate/ac3-4.0.err for details.
make: *** [src/tests/Makefile:307: fate-ac3-4.0] Error 1
TEST ac3-2.0
stddev: 2.57 PSNR: 88.10 MAXDIFF: 414 bytes: 804864/ 804864
MAXDIFF: |414 - 0| >= 1
Test ac3-2.0 failed. Look at tests/data/fate/ac3-2.0.err for details.
make: *** [src/tests/Makefile:307: fate-ac3-2.0] Error 1
TEST ac3-4.0-downmix-stereo
stddev: 2.99 PSNR: 86.81 MAXDIFF: 198 bytes: 380928/ 380928
MAXDIFF: |198 - 0| >= 1
Test ac3-4.0-downmix-stereo failed. Look at
tests/data/fate/ac3-4.0-downmix-stereo.err for details.
make: *** [src/tests/Makefile:307: fate-ac3-4.0-downmix-stereo] Error 1
TEST ac3-4.0-downmix-mono
stddev: 4.11 PSNR: 84.05 MAXDIFF: 281 bytes: 190464/ 190464
MAXDIFF: |281 - 0| >= 1
Test ac3-4.0-downmix-mono failed. Look at
tests/data/fate/ac3-4.0-downmix-mono.err for details.
make: *** [src/tests/Makefile:307: fate-ac3-4.0-downmix-mono] Error 1
TEST ac3-fixed-2.0
stddev: 382.35 PSNR: 44.68 MAXDIFF:32866 bytes: 804864/ 804864
MAXDIFF: |32866 - 0| >= 1
Test ac3-fixed-2.0 failed. Look at tests/data/fate/ac3-fixed-2.0.err for
details.
make: *** [src/tests/Makefile:307: fate-ac3-fixed-2.0] Error 1
TEST ac3-fixed-4.0-downmix-mono
stddev: 1140.81 PSNR: 35.18 MAXDIFF:34416 bytes: 190464/ 190464
MAXDIFF: |34416 - 0| >= 1
Test ac3-fixed-4.0-downmix-mono failed. Look at
tests/data/fate/ac3-fixed-4.0-downmix-mono.err for details.
make: *** [src/tests/Makefile:307: fate-ac3-fixed-4.0-downmix-mono] Error 1
TEST ac3-fixed-encode
--- - 2022-09-25 14:22:45.695390813 +0200
+++ tests/data/fate/ac3-fixed-encode 2022-09-25 14:22:45.687999547 +0200
@@ -1 +1 @@
-1f548175e11a95e62ce20e442fcc8d08
+e9d78bca187b4bbafc4512bcea8efd3e
Test ac3-fixed-encode failed. Look at
tests/data/fate/ac3-fixed-encode.err for details.
make: *** [src/tests/Makefile:307: fate-ac3-fixed-encode] Error 1
(Additionally, checksums in unknown_layout-ac3, lavf-rm, shortest,
copy-shortest1 and copy-shortest2 need to be updated.)
As the above shows, the difference between the reference files and the
decoded output becomes larger in several tests, i.e. the reference files
won't be usable later on. If the new float and fixed-point decoders
indeed produce nearly identical output, then one could write
tests that decode the same file with both the floating point and the
fixed point decoder, check that both are nearly identical and print a
checksum of the output of the fixed point decoder.
Also note that there is currently no test that directly verifies your
claims of greater accuracy. One could write such a test by encoding a
file with ac3-fixed and decoding it again (with the fixed point decoder)
and printing the PSNR of input and output. No encoding test does this
at the moment.
- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 6/6] wmaprodec: convert to lavu/tx
2022-09-23 23:20 ` [FFmpeg-devel] [PATCH 6/6] wmaprodec: " Lynne
@ 2022-09-25 12:38 ` Andreas Rheinhardt
0 siblings, 0 replies; 19+ messages in thread
From: Andreas Rheinhardt @ 2022-09-25 12:38 UTC (permalink / raw)
To: ffmpeg-devel
Lynne:
> - for (int i = 0; i < WMAPRO_BLOCK_SIZES; i++) {
> - ret = ff_mdct_init(&s->mdct_ctx[i], WMAPRO_BLOCK_MIN_BITS + 1 + i, 1,
> - 1.0 / (1 << (WMAPRO_BLOCK_MIN_BITS + i - 1))
> - / (1ll << (s->bits_per_sample - 1)));
> - if (ret < 0)
> - return ret;
> + for (i = 0; i < WMAPRO_BLOCK_SIZES; i++) {
Unnecessary change.
> + const float scale = 1.0 / (1 << (WMAPRO_BLOCK_MIN_BITS + i - 1))
> + / (1ll << (s->bits_per_sample - 1));
> + int err = av_tx_init(&s->tx[i], &s->tx_fn[i], AV_TX_FLOAT_MDCT, 1,
> + 1 << (WMAPRO_BLOCK_MIN_BITS + i), &scale, 0);
> + if (err < 0)
> + return err;
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
2022-09-24 21:57 ` Lynne
@ 2022-09-25 19:55 ` Martin Storsjö
2022-09-25 20:45 ` Lynne
0 siblings, 1 reply; 19+ messages in thread
From: Martin Storsjö @ 2022-09-25 19:55 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Ben Avison
On Sat, 24 Sep 2022, Lynne wrote:
>> What about ac3dsp then - that one seems like it's fairly optimized for arm?
>>
>
> Haven't touched them, they're still being used. Unfortunately, for AC3,
> the full MDCT optimizations in lavc do make a difference and the overall
> decoder becomes 15% slower with this patch on for aarch64 with lavu/tx's
> asm disabled and 7% slower with lavu/tx's asm enabled.
Hmm, that's a shame...
> I do plan to write an aarch64 MDCT NEON SIMD code in a month or so,
> unless someone is faster, which should make the decoder at least 10%
> faster with lavu/tx.
Would you consider holding off of converting the ac3 decoder until this
point, to avoid unnecessary temporary performance regressions at least for
the architectures that are covered by the new lavu/tx framework?
> If you'd like to help out, I've documented the C factorizations used in
> docs/transforms.md.
Sorry, I don't think I have time at the moment to take on writing new code
from scratch for this...
I could maybe consider porting the aarch64 assembly to arm32; if it's not
register starved, it's usually quite straightforward to do such rewrites
(there's either half the number of SIMD registers compared to aarch64, or
the same number but half the length).
The reason why I'm asking about arm32, is because ffmpeg has got a bunch
of users who have spent a fair amount of effort on reaching specific
performance levels for some codecs, both for raspberry pi 1 (which doesn't
have neon but only vfp) and for the newer ones with neon. I don't remember
exactly which codecs are relevant for these users - I doubt opus is, but
ac3 and dca are, iirc.
I'm CCing Ben Avison who has contributed a lot of optimizations in this
area.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
2022-09-25 19:55 ` Martin Storsjö
@ 2022-09-25 20:45 ` Lynne
0 siblings, 0 replies; 19+ messages in thread
From: Lynne @ 2022-09-25 20:45 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Sep 25, 2022, 21:55 by martin@martin.st:
> On Sat, 24 Sep 2022, Lynne wrote:
>
>>> What about ac3dsp then - that one seems like it's fairly optimized for arm?
>>>
>>
>> Haven't touched them, they're still being used. Unfortunately, for AC3,
>> the full MDCT optimizations in lavc do make a difference and the overall
>> decoder becomes 15% slower with this patch on for aarch64 with lavu/tx's
>> asm disabled and 7% slower with lavu/tx's asm enabled.
>>
>
> Hmm, that's a shame...
>
>> I do plan to write an aarch64 MDCT NEON SIMD code in a month or so, unless someone is faster, which should make the decoder at least 10% faster with lavu/tx.
>>
>
> Would you consider holding off of converting the ac3 decoder until this point, to avoid unnecessary temporary performance regressions at least for the architectures that are covered by the new lavu/tx framework?
>
>> If you'd like to help out, I've documented the C factorizations used in
>> docs/transforms.md.
>>
>
> Sorry, I don't think I have time at the moment to take on writing new code from scratch for this...
>
> I could maybe consider porting the aarch64 assembly to arm32; if it's not register starved, it's usually quite straightforward to do such rewrites (there's either half the number of SIMD registers compared to aarch64, or the same number but half the length)
>
For the basis transforms (double 4, double 8 and 8, single 16), there's no starvation.
For the 32pt transform, it's a bit starved, but nothing you couldn't work out.
For the 64pt and up, absolutely all registers are used to the point of needing to
stash vector regs across gprs. If all registers are written back to memory (no register
sharing between transform sizes), it becomes as starved as the 32pt.
It's obvious to see where the starvation happens (only 32pt -> 64pt) and how to fix it,
but it's still work to convert code. Take a look at it and see if you can spot something
that would make it difficult?
> The reason why I'm asking about arm32, is because ffmpeg has got a bunch of users who have spent a fair amount of effort on reaching specific performance levels for some codecs, both for raspberry pi 1 (which doesn't have neon but only vfp) and for the newer ones with neon. I don't remember exactly which codecs are relevant for these users - I doubt opus is, but ac3 and dca are, iirc.
>
We do maintain old versions for years after a release. And we recently-ish
had a major bump, and very recently 5.1. I think there's enough time to
bring them back up and make them faster still before stuck users become
quite outdated, what about you? Maybe someone who's interested could
notice and help out?
> I'm CCing Ben Avison who has contributed a lot of optimizations in this area.
>
Thanks.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
2022-09-25 12:34 ` Andreas Rheinhardt
@ 2022-09-25 21:08 ` Lynne
2022-09-25 21:17 ` Andreas Rheinhardt
0 siblings, 1 reply; 19+ messages in thread
From: Lynne @ 2022-09-25 21:08 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Sep 25, 2022, 14:34 by andreas.rheinhardt@outlook.com:
> Lynne:
>
>> Sep 24, 2022, 23:57 by dev@lynne.ee:
>>
>>> Sep 24, 2022, 21:40 by martin@martin.st:
>>>
>>>> What about ac3dsp then - that one seems like it's fairly optimized for arm?
>>>>
>>> Haven't touched them, they're still being used. Unfortunately, for AC3,
>>> the full MDCT optimizations in lavc do make a difference and the overall
>>> decoder becomes 15% slower with this patch on for aarch64 with lavu/tx's
>>> asm disabled and 7% slower with lavu/tx's asm enabled. I do plan to write
>>> an aarch64 MDCT NEON SIMD code in a month or so, unless someone is faster,
>>> which should make the decoder at least 10% faster with lavu/tx.
>>>
>>
>> I'd just like to add this was for the float version of the ac3 decoder. The fixed-point
>> version is a few percent faster with the patch on an A53, and quite a bit
>> more accurate.
>> The lavc fixed-point FFT code also has some weird large spikes in #cycles
>> for some transform sizes, so the figure above is an average, but the dips
>> went from 117x realtime to 78x realtime, which on a slower CPU may
>> be the difference between stuttering and realtime playback.
>> On this CPU, the fixed-point version is 23% slower than the float version,
>> but on a CPU with slower float ops, it would make more sense to pick that
>> decoder up than the float version.
>> The 2 decoders produce nearly identical results, minus a few rounding
>> errors, since AC3 is inherently a fixed-point codec. The only difference
>> are the transforms themselves, and the extra ops needed to convert
>> the 25bit ints to floats in the float decoder.
>>
>
> 1. You forgot to remove mdct15 requirements from configure in this whole
> patchset.
> 2. You forgot to update the FATE references for several tests; e.g. when
> only applying the ac3 patch, then I get this:
>
I know. durandal pointed it out the day I sent them. I'll send them again
later.
I'm planning to just push the Opus patch in a day with the mdct15
line in configure gone.
> As the above shows, the difference between the reference files and the
> decoded output becomes larger in several tests, i.e. the reference files
> won't be usable later on. If the new float and fixed-point decoders
> indeed produce nearly identical output, then one could write
> tests that decode the same file with both the floating point and the
> fixed point decoder, check that both are nearly identical and print a
> checksum of the output of the fixed point decoder.
>
I have a standalone program I've hacked on as I need to for the fixed-point
transforms: https://0x0.st/oWxO.c
The square root of the squared rounding error across the entire range
(1 to 21 bits) of transforms from 32pt to 1024pt is 6.855655 for lavu and
7.141428 for lavc, which is slightly worse. If you extend the range
to 22bits, the 1024pt transform in lavc explodes, while lavu is still fine,
thus showing a greater range.
The rounding errors are a lesser problem than hitting the max range,
because then you get huge spikes in the output.
I can further reduce the error in lavu at the cost of speed, but I think
this is sufficient.
> Also note that there is currently no test that directly verifies your
> claims of greater accuracy. One could write such a test by encoding a
> file with ac3-fixed and decoding it again (with the fixed point decoder)
> and printing the PSNR of input and output. No encoding test does this
> at the moment.
>
I'm not writing that, but I like the idea. The point of fixed-point decoders
isn't bit-exactness, but speed on slow hardware, so we shouldn't be testing
an MD5.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
2022-09-25 21:08 ` Lynne
@ 2022-09-25 21:17 ` Andreas Rheinhardt
2022-09-25 21:46 ` Lynne
0 siblings, 1 reply; 19+ messages in thread
From: Andreas Rheinhardt @ 2022-09-25 21:17 UTC (permalink / raw)
To: ffmpeg-devel
Lynne:
> Sep 25, 2022, 14:34 by andreas.rheinhardt@outlook.com:
>
>> Lynne:
>>
>>> Sep 24, 2022, 23:57 by dev@lynne.ee:
>>>
>>>> Sep 24, 2022, 21:40 by martin@martin.st:
>>>>
>>>>> What about ac3dsp then - that one seems like it's fairly optimized for arm?
>>>>>
>>>> Haven't touched them, they're still being used. Unfortunately, for AC3,
>>>> the full MDCT optimizations in lavc do make a difference and the overall
>>>> decoder becomes 15% slower with this patch on for aarch64 with lavu/tx's
>>>> asm disabled and 7% slower with lavu/tx's asm enabled. I do plan to write
>>>> an aarch64 MDCT NEON SIMD code in a month or so, unless someone is faster,
>>>> which should make the decoder at least 10% faster with lavu/tx.
>>>>
>>>
>>> I'd just like to add this was for the float version of the ac3 decoder. The fixed-point
>>> version is a few percent faster with the patch on an A53, and quite a bit
>>> more accurate.
>>> The lavc fixed-point FFT code also has some weird large spikes in #cycles
>>> for some transform sizes, so the figure above is an average, but the dips
>>> went from 117x realtime to 78x realtime, which on a slower CPU may
>>> be the difference between stuttering and realtime playback.
>>> On this CPU, the fixed-point version is 23% slower than the float version,
>>> but on a CPU with slower float ops, it would make more sense to pick that
>>> decoder up than the float version.
>>> The 2 decoders produce nearly identical results, minus a few rounding
>>> errors, since AC3 is inherently a fixed-point codec. The only difference
>>> are the transforms themselves, and the extra ops needed to convert
>>> the 25bit ints to floats in the float decoder.
>>>
>>
>> 1. You forgot to remove mdct15 requirements from configure in this whole
>> patchset.
>> 2. You forgot to update the FATE references for several tests; e.g. when
>> only applying the ac3 patch, then I get this:
>>
>
> I know. durandal pointed it out the day I sent them. I'll send them again
> later.
> I'm planning to just push the Opus patch in a day with the mdct15
> line in configure gone.
>
>
>> As the above shows, the difference between the reference files and the
>> decoded output becomes larger in several tests, i.e. the reference files
>> won't be usable later on. If the new float and fixed-point decoders
>> indeed produce nearly identical output, then one could write
>> tests that decode the same file with both the floating point and the
>> fixed point decoder, check that both are nearly identical and print a
>> checksum of the output of the fixed point decoder.
>>
>
> I have a standalone program I've hacked on as I need to for the fixed-point
> transforms: https://0x0.st/oWxO.c
> The square root of the squared rounding error across the entire range
> (1 to 21 bits) of transforms from 32pt to 1024pt is 6.855655 for lavu and
> 7.141428 for lavc, which is slightly worse. If you extend the range
> to 22bits, the 1024pt transform in lavc explodes, while lavu is still fine,
> thus showing a greater range.
> The rounding errors are a lesser problem than hitting the max range,
> because then you get huge spikes in the output.
> I can further reduce the error in lavu at the cost of speed, but I think
> this is sufficient.
>
>
>> Also note that there is currently no test that directly verifies your
>> claims of greater accuracy. One could write such a test by encoding a
>> file with ac3-fixed and decoding it again (with the fixed point decoder)
>> and printing the PSNR of input and output. No encoding test does this
>> at the moment.
>>
>
> I'm not writing that, but I like the idea, the point of fixed-point decoders
> isn't bitexactness, but speed on slow hardware, so we shouldn't be testing
> an MD5.
Are your fixed-point transforms bitexact across all arches/cpuflags?
- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx
2022-09-25 21:17 ` Andreas Rheinhardt
@ 2022-09-25 21:46 ` Lynne
0 siblings, 0 replies; 19+ messages in thread
From: Lynne @ 2022-09-25 21:46 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Sep 25, 2022, 23:17 by andreas.rheinhardt@outlook.com:
> Lynne:
>
>> Sep 25, 2022, 14:34 by andreas.rheinhardt@outlook.com:
>>
>>> Lynne:
>>>
>>>> Sep 24, 2022, 23:57 by dev@lynne.ee:
>>>>
>>>>> Sep 24, 2022, 21:40 by martin@martin.st:
>>>>>
>>>>>> What about ac3dsp then - that one seems like it's fairly optimized for arm?
>>>>>>
>>>>> Haven't touched them, they're still being used. Unfortunately, for AC3,
>>>>> the full MDCT optimizations in lavc do make a difference and the overall
>>>>> decoder becomes 15% slower with this patch on for aarch64 with lavu/tx's
>>>>> asm disabled and 7% slower with lavu/tx's asm enabled. I do plan to write
>>>>> an aarch64 MDCT NEON SIMD code in a month or so, unless someone is faster,
>>>>> which should make the decoder at least 10% faster with lavu/tx.
>>>>>
>>>>
>>>> I'd just like to add this was for the float version of the ac3 decoder. The fixed-point
>>>> version is a few percent faster with the patch on an A53, and quite a bit
>>>> more accurate.
>>>> The lavc fixed-point FFT code also has some weird large spikes in #cycles
>>>> for some transform sizes, so the figure above is an average, but the dips
>>>> went from 117x realtime to 78x realtime, which on a slower CPU may
>>>> be the difference between stuttering and realtime playback.
>>>> On this CPU, the fixed-point version is 23% slower than the float version,
>>>> but on a CPU with slower float ops, it would make more sense to pick that
>>>> decoder up than the float version.
>>>> The 2 decoders produce nearly identical results, minus a few rounding
>>>> errors, since AC3 is inherently a fixed-point codec. The only difference
>>>> are the transforms themselves, and the extra ops needed to convert
>>>> the 25bit ints to floats in the float decoder.
>>>>
>>>
>>> 1. You forgot to remove mdct15 requirements from configure in this whole
>>> patchset.
>>> 2. You forgot to update the FATE references for several tests; e.g. when
>>> only applying the ac3 patch, then I get this:
>>>
>>
>> I know. durandal pointed it out the day I sent them. I'll send them again
>> later.
>> I'm planning to just push the Opus patch in a day with the mdct15
>> line in configure gone.
>>
>>
>>> As the above shows, the difference between the reference files and the
>>> decoded output becomes larger in several tests, i.e. the reference files
>>> won't be usable later on. If the new float and fixed-point decoders
>>> indeed produce nearly identical output, then one could write
>>> tests that decode the same file with both the floating point and the
>>> fixed point decoder, check that both are nearly identical and print a
>>> checksum of the output of the fixed point decoder.
>>>
>>
>> I have a standalone program I've hacked on as I need to for the fixed-point
>> transforms: https://0x0.st/oWxO.c
>> The square root of the squared rounding error across the entire range
>> (1 to 21 bits) of transforms from 32pt to 1024pt is 6.855655 for lavu and
>> 7.141428 for lavc, which is slightly worse. If you extend the range
>> to 22bits, the 1024pt transform in lavc explodes, while lavu is still fine,
>> thus showing a greater range.
>> The rounding errors are a lesser problem than hitting the max range,
>> because then you get huge spikes in the output.
>> I can further reduce the error in lavu at the cost of speed, but I think
>> this is sufficient.
>>
>>
>>> Also note that there is currently no test that directly verifies your
>>> claims of greater accuracy. One could write such a test by encoding a
>>> file with ac3-fixed and decoding it again (with the fixed point decoder)
>>> and printing the PSNR of input and output. No encoding test does this
>>> at the moment.
>>>
>>
>> I'm not writing that, but I like the idea, the point of fixed-point decoders
>> isn't bitexactness, but speed on slow hardware, so we shouldn't be testing
>> an MD5.
>>
>
> Are your fixed-point transforms bitexact across all arches/cpuflags?
>
As much as libavcodec's. This is because we use a float value for the MDCT scale,
and we calculate the exptabs and FFT tables with floats before converting
them to ints during init. If issues arise, we could specialcase them, though as
libavcodec's hasn't needed that, lavu doesn't need it either.
Since the FFT tables are always constant, they would benefit from hardcoding,
as it would take any local machine precision out of the equation. The actual
constants are quantized versions of the computed floats, which also have a fair amount of leeway.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 19+ messages in thread
end of thread, other threads:[~2022-09-25 21:46 UTC | newest]
Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-23 23:14 [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder to lavu/tx Lynne
[not found] ` <NCgcUxK--3-2@lynne.ee-NCgcZNj----2>
2022-09-23 23:15 ` [FFmpeg-devel] [PATCH 2/6] atrac9dec: switch " Lynne
[not found] ` <NCgciJh--3-2@lynne.ee-NCgclLI----2>
2022-09-23 23:18 ` [FFmpeg-devel] [PATCH 3/6] ac3: convert encoder and decoder " Lynne
[not found] ` <NCgdFqI--B-2@lynne.ee-NCgdIwE----2>
2022-09-23 23:18 ` [FFmpeg-devel] [PATCH 4/6] vorbisdec: convert " Lynne
[not found] ` <NCgdOA8--3-2@lynne.ee-NCgdR4N----2>
2022-09-23 23:19 ` [FFmpeg-devel] [PATCH 5/6] twinvq: " Lynne
[not found] ` <NCgdYSD--3-2@lynne.ee-NCgdaK4----2>
2022-09-23 23:20 ` [FFmpeg-devel] [PATCH 6/6] wmaprodec: " Lynne
2022-09-25 12:38 ` Andreas Rheinhardt
2022-09-24 18:42 ` [FFmpeg-devel] [PATCH 1/6] opus: convert encoder and decoder " Martin Storsjö
2022-09-24 19:26 ` Hendrik Leppkes
2022-09-24 19:31 ` Hendrik Leppkes
2022-09-24 19:40 ` Martin Storsjö
2022-09-24 21:57 ` Lynne
2022-09-25 19:55 ` Martin Storsjö
2022-09-25 20:45 ` Lynne
[not found] ` <NClNyyy--3-2@lynne.ee-NClVNO6----2>
2022-09-25 7:54 ` Lynne
2022-09-25 12:34 ` Andreas Rheinhardt
2022-09-25 21:08 ` Lynne
2022-09-25 21:17 ` Andreas Rheinhardt
2022-09-25 21:46 ` Lynne
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git