Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT transforms
@ 2023-08-03 16:26 Lynne
       [not found] ` <NawD9Lq--3-9@lynne.ee-NawDDOJ----9>
  2023-08-03 20:32 ` [FFmpeg-devel] [PATCH " Michael Niedermayer
  0 siblings, 2 replies; 7+ messages in thread
From: Lynne @ 2023-08-03 16:26 UTC (permalink / raw)
  To: Ffmpeg Devel

[-- Attachment #1: Type: text/plain, Size: 241 bytes --]

These are in-place transforms, required for DCT-I and DST-I.

Templated as the mod2 variant requires minor modifications, and is
required specifically for DCT-I/DST-I.

Quite optimized, as there's no need for any additional buffer storage.


[-- Attachment #2: 0001-lavu-tx-add-real-to-real-and-real-to-imaginary-RDFT-.patch --]
[-- Type: text/x-diff, Size: 18587 bytes --]

From 2ea5e2541c2551bf1b56e967d35946289a85aa49 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Thu, 3 Aug 2023 18:21:23 +0200
Subject: [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT
 transforms

These are in-place transforms, required for DCT-I and DST-I.

Templated as the mod2 variant requires minor modifications, and is
required specifically for DCT-I/DST-I.
---
 doc/APIchanges          |   3 +
 libavutil/tx.c          |  18 ++++-
 libavutil/tx.h          |  10 +++
 libavutil/tx_template.c | 175 +++++++++++++++++++++++++++++++---------
 libavutil/version.h     |   2 +-
 5 files changed, 167 insertions(+), 41 deletions(-)

diff --git a/doc/APIchanges b/doc/APIchanges
index 5afe8bcb75..edd178be4f 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -2,6 +2,9 @@ The last version increases of all libraries were on 2023-02-09
 
 API changes, most recent first:
 
+2023-07-xx - xxxxxxxxxx - lavu 58.15.100 - tx.h
+  Add AV_TX_REAL_TO_REAL and AV_TX_REAL_TO_IMAGINARY
+
 2023-07-xx - xxxxxxxxxx - lavc 60 - avcodec.h
   Deprecate AV_CODEC_FLAG_DROPCHANGED without replacement.
 
diff --git a/libavutil/tx.c b/libavutil/tx.c
index e25abf998f..e9826e6107 100644
--- a/libavutil/tx.c
+++ b/libavutil/tx.c
@@ -437,7 +437,9 @@ int ff_tx_decompose_length(int dst[TX_MAX_DECOMPOSITIONS], enum AVTXType type,
 
             /* Check direction for non-orthogonal codelets */
             if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) ||
-                ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv))
+                ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv) ||
+                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_REAL)) && inv) ||
+                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_IMAGINARY)) && inv))
                 continue;
 
             /* Check if the CPU supports the required ISA */
@@ -560,6 +562,10 @@ static void print_flags(AVBPrint *bp, uint64_t f)
         av_bprintf(bp, "%spreshuf", prev > 1 ? sep : "");
     if ((f & AV_TX_FULL_IMDCT) && ++prev)
         av_bprintf(bp, "%simdct_full", prev > 1 ? sep : "");
+    if ((f & AV_TX_REAL_TO_REAL) && ++prev)
+        av_bprintf(bp, "%sreal_to_real", prev > 1 ? sep : "");
+    if ((f & AV_TX_REAL_TO_IMAGINARY) && ++prev)
+        av_bprintf(bp, "%sreal_to_imaginary", prev > 1 ? sep : "");
     if ((f & FF_TX_ASM_CALL) && ++prev)
         av_bprintf(bp, "%sasm_call", prev > 1 ? sep : "");
     av_bprintf(bp, "]");
@@ -717,7 +723,11 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
     uint64_t req_flags = flags;
 
     /* Flags the codelet may require to be present */
-    uint64_t inv_req_mask = AV_TX_FULL_IMDCT | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL;
+    uint64_t inv_req_mask = AV_TX_FULL_IMDCT |
+                            AV_TX_REAL_TO_REAL |
+                            AV_TX_REAL_TO_IMAGINARY |
+                            FF_TX_PRESHUFFLE |
+                            FF_TX_ASM_CALL;
 
     /* Unaligned codelets are compatible with the aligned flag */
     if (req_flags & FF_TX_ALIGNED)
@@ -742,7 +752,9 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
 
             /* Check direction for non-orthogonal codelets */
             if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) ||
-                ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv))
+                ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv) ||
+                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_REAL)) && inv) ||
+                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_IMAGINARY)) && inv))
                 continue;
 
             /* Check if the requested flags match from both sides */
diff --git a/libavutil/tx.h b/libavutil/tx.h
index 064edbc097..d178e8ee9d 100644
--- a/libavutil/tx.h
+++ b/libavutil/tx.h
@@ -149,6 +149,16 @@ enum AVTXFlags {
      * Ignored for all transforms but inverse MDCTs.
      */
     AV_TX_FULL_IMDCT = 1ULL << 2,
+
+    /**
+     * Perform a real to half-complex RDFT.
+     * Only the real or only the imaginary coefficients are output,
+     * depending on which flag is used. Only available for forward RDFTs.
+     * The output array must have enough space to hold N complex values
+     * (the regular size for a real to complex transform).
+     */
+    AV_TX_REAL_TO_REAL      = 1ULL << 3,
+    AV_TX_REAL_TO_IMAGINARY = 1ULL << 4,
 };
 
 /**
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index c4ec9502e0..50c65d00b5 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -1613,14 +1613,17 @@ static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s,
     int ret;
     double f, m;
     TXSample *tab;
+    int len4 = FFALIGN(len, 4) / 4;
 
     s->scale_d = *((SCALE_TYPE *)scale);
     s->scale_f = s->scale_d;
 
+    flags &= ~(AV_TX_REAL_TO_REAL | AV_TX_REAL_TO_IMAGINARY);
+
     if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
         return ret;
 
-    if (!(s->exp = av_mallocz((8 + (len >> 2) - 1)*sizeof(*s->exp))))
+    if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))
         return AVERROR(ENOMEM);
 
     tab = (TXSample *)s->exp;
@@ -1639,17 +1642,20 @@ static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s,
     *tab++ = RESCALE( (0.5 - inv) * m);
     *tab++ = RESCALE(-(0.5 - inv) * m);
 
-    for (int i = 0; i < len >> 2; i++)
+    for (int i = 0; i < len4; i++)
         *tab++ = RESCALE(cos(i*f));
-    for (int i = len >> 2; i >= 0; i--)
-        *tab++ = RESCALE(cos(i*f) * (inv ? +1.0 : -1.0));
+
+    tab = ((TXSample *)s->exp) + len4 + 8;
+
+    for (int i = 0; i < len4; i++)
+        *tab++ = RESCALE(cos(((float)len/4.0 - (float)i + 0)*f) * (inv ? +1.0 : -1.0));
 
     return 0;
 }
 
-#define DECL_RDFT(name, inv)                                                   \
-static void TX_NAME(ff_tx_rdft_ ##name)(AVTXContext *s, void *_dst,            \
-                                       void *_src, ptrdiff_t stride)           \
+#define DECL_RDFT(n, inv)                                                      \
+static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
+                                     void *_src, ptrdiff_t stride)             \
 {                                                                              \
     const int len2 = s->len >> 1;                                              \
     const int len4 = s->len >> 2;                                              \
@@ -1698,40 +1704,131 @@ static void TX_NAME(ff_tx_rdft_ ##name)(AVTXContext *s, void *_dst,            \
         data[len2].re = data[0].im;                                            \
         data[   0].im = data[len2].im = 0;                                     \
     }                                                                          \
-}
+}                                                                              \
+                                                                               \
+static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
+    .name       = TX_NAME_STR("rdft_" #n),                                     \
+    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
+    .type       = TX_TYPE(RDFT),                                               \
+    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |       \
+                  (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY), /* parens: '|' binds tighter than '?:' */ \
+    .factors    = { 4, TX_FACTOR_ANY },                                        \
+    .nb_factors = 2,                                                           \
+    .min_len    = 4,                                                           \
+    .max_len    = TX_LEN_UNLIMITED,                                            \
+    .init       = TX_NAME(ff_tx_rdft_init), /* one init for r2c and c2r */     \
+    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
+    .prio       = FF_TX_PRIO_BASE,                                             \
+};
 
-DECL_RDFT(r2c, 0)
-DECL_RDFT(c2r, 1)
+DECL_RDFT(r2c,  0)
+DECL_RDFT(c2r,  1)
 
-static const FFTXCodelet TX_NAME(ff_tx_rdft_r2c_def) = {
-    .name       = TX_NAME_STR("rdft_r2c"),
-    .function   = TX_NAME(ff_tx_rdft_r2c),
-    .type       = TX_TYPE(RDFT),
-    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
-                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
-    .factors    = { 2, TX_FACTOR_ANY },
-    .nb_factors = 2,
-    .min_len    = 2,
-    .max_len    = TX_LEN_UNLIMITED,
-    .init       = TX_NAME(ff_tx_rdft_init),
-    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
-    .prio       = FF_TX_PRIO_BASE,
+#define DECL_RDFT_HALF(n, mode, mod2)                                          \
+static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
+                                        void *_src, ptrdiff_t stride)          \
+{ /* n: name suffix; mode: AV_TX_REAL_TO_* flag; mod2: 0 or 1 */               \
+    const int len = s->len;                                                    \
+    const int len2 = len >> 1;                                                 \
+    const int len4 = len >> 2;                                                 \
+    const int aligned_len4 = FFALIGN(len, 4)/4;                                \
+    const TXSample *fact = (void *)s->exp;                                     \
+    const TXSample *tcos = fact + 8; /* skips fact[0..7] */                    \
+    const TXSample *tsin = tcos + aligned_len4;                                \
+    TXComplex *data = _dst;                                                    \
+    TXSample *out = _dst; /* Half-complex is forward-only */                   \
+    TXSample tmp_dc;                                                           \
+    av_unused TXSample tmp_mid;                                                \
+    TXSample tmp[4];                                                           \
+    TXComplex sf, sl;                                                          \
+    /* Sub-transform output lands in _dst; the rest works on it in place */    \
+    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex));                       \
+                                                                               \
+    tmp_dc = data[0].re;                                                       \
+    data[   0].re = tmp_dc + data[0].im;                                       \
+    tmp_dc        = tmp_dc - data[0].im;                                       \
+                                                                               \
+    data[   0].re = MULT(fact[0], data[   0].re);                              \
+    tmp_dc        = MULT(fact[1],        tmp_dc);                              \
+    data[len4].re = MULT(fact[2], data[len4].re);                              \
+                                                                               \
+    if (!mod2) {                                                               \
+        data[len4].im = MULT(fact[3], data[len4].im);                          \
+    } else {                                                                   \
+        sf = data[len4];                                                       \
+        sl = data[len4 + 1];                                                   \
+        if (mode == AV_TX_REAL_TO_REAL)                                        \
+            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
+        else                                                                   \
+            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
+        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
+        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
+                                                                               \
+        if (mode == AV_TX_REAL_TO_REAL) {                                      \
+            tmp[3]  = tmp[1]*tcos[len4] - tmp[2]*tsin[len4];                   \
+            tmp_mid = (tmp[0] - tmp[3]);                                       \
+        } else {                                                               \
+            tmp[3]  = tmp[1]*tsin[len4] + tmp[2]*tcos[len4];                   \
+            tmp_mid = (tmp[0] + tmp[3]);                                       \
+        }                                                                      \
+    }                                                                          \
+                                                                               \
+    /* NOTE: unrolling this breaks non-mod8 lengths */                         \
+    for (int i = 1; i <= len4; i++) {                                          \
+        TXSample tmp[4];                                                       \
+        TXComplex sf = data[i];                                                \
+        TXComplex sl = data[len2 - i];                                         \
+                                                                               \
+        if (mode == AV_TX_REAL_TO_REAL)                                        \
+            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
+        else                                                                   \
+            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
+                                                                               \
+        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
+        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
+                                                                               \
+        if (mode == AV_TX_REAL_TO_REAL) {                                      \
+            tmp[3]           = tmp[1]*tcos[i] - tmp[2]*tsin[i];                \
+            out[i]           = (tmp[0] + tmp[3]);                              \
+            out[len - i]     = (tmp[0] - tmp[3]);                              \
+        } else {                                                               \
+            tmp[3]           = tmp[1]*tsin[i] + tmp[2]*tcos[i];                \
+            out[i - 1]       = (tmp[3] - tmp[0]);                              \
+            out[len - i - 1] = (tmp[0] + tmp[3]);                              \
+        }                                                                      \
+    }                                                                          \
+    /* Copy out[len - i] down to out[len2 - i] */                              \
+    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++)       \
+        out[len2 - i] = out[len - i];                                          \
+                                                                               \
+    if (mode == AV_TX_REAL_TO_REAL) {                                          \
+        out[len2] = tmp_dc;                                                    \
+        if (mod2)                                                              \
+            out[len4 + 1] = tmp_mid;                                           \
+    } else if (mod2) {                                                         \
+        out[len4] = tmp_mid;                                                   \
+    }                                                                          \
+}                                                                              \
+                                                                               \
+static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
+    .name       = TX_NAME_STR("rdft_" #n),                                     \
+    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
+    .type       = TX_TYPE(RDFT),                                               \
+    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | mode |                     \
+                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,                     \
+    .factors    = { 2 + 2*(!mod2), TX_FACTOR_ANY }, /* 2 if mod2, else 4 */    \
+    .nb_factors = 2,                                                           \
+    .min_len    = 2 + 2*(!mod2),                                               \
+    .max_len    = TX_LEN_UNLIMITED,                                            \
+    .init       = TX_NAME(ff_tx_rdft_init),                                    \
+    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
+    .prio       = FF_TX_PRIO_BASE,                                             \
 };
 
-static const FFTXCodelet TX_NAME(ff_tx_rdft_c2r_def) = {
-    .name       = TX_NAME_STR("rdft_c2r"),
-    .function   = TX_NAME(ff_tx_rdft_c2r),
-    .type       = TX_TYPE(RDFT),
-    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
-                  FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
-    .factors    = { 2, TX_FACTOR_ANY },
-    .nb_factors = 2,
-    .min_len    = 2,
-    .max_len    = TX_LEN_UNLIMITED,
-    .init       = TX_NAME(ff_tx_rdft_init),
-    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
-    .prio       = FF_TX_PRIO_BASE,
-};
+DECL_RDFT_HALF(r2r,      AV_TX_REAL_TO_REAL,      0)
+DECL_RDFT_HALF(r2r_mod2, AV_TX_REAL_TO_REAL,      1)
+DECL_RDFT_HALF(r2i,      AV_TX_REAL_TO_IMAGINARY, 0)
+DECL_RDFT_HALF(r2i_mod2, AV_TX_REAL_TO_IMAGINARY, 1)
 
 static av_cold int TX_NAME(ff_tx_dct_init)(AVTXContext *s,
                                            const FFTXCodelet *cd,
@@ -1997,6 +2094,10 @@ const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
     &TX_NAME(ff_tx_mdct_naive_inv_def),
     &TX_NAME(ff_tx_mdct_inv_full_def),
     &TX_NAME(ff_tx_rdft_r2c_def),
+    &TX_NAME(ff_tx_rdft_r2r_def),
+    &TX_NAME(ff_tx_rdft_r2r_mod2_def),
+    &TX_NAME(ff_tx_rdft_r2i_def),
+    &TX_NAME(ff_tx_rdft_r2i_mod2_def),
     &TX_NAME(ff_tx_rdft_c2r_def),
     &TX_NAME(ff_tx_dctII_def),
     &TX_NAME(ff_tx_dctIII_def),
diff --git a/libavutil/version.h b/libavutil/version.h
index 24af520e08..9e798b0e3f 100644
--- a/libavutil/version.h
+++ b/libavutil/version.h
@@ -79,7 +79,7 @@
  */
 
 #define LIBAVUTIL_VERSION_MAJOR  58
-#define LIBAVUTIL_VERSION_MINOR  14
+#define LIBAVUTIL_VERSION_MINOR  15
 #define LIBAVUTIL_VERSION_MICRO 100
 
 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
-- 
2.40.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH 2/2] lavu/tx: add DCT-I and DST-I transforms
       [not found] ` <NawD9Lq--3-9@lynne.ee-NawDDOJ----9>
@ 2023-08-03 16:31   ` Lynne
  2023-08-03 16:42   ` [FFmpeg-devel] [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT transforms Lynne
       [not found]   ` <NawGxLe--3-9@lynne.ee-NawH0-d----9>
  2 siblings, 0 replies; 7+ messages in thread
From: Lynne @ 2023-08-03 16:31 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 618 bytes --]

These are true, actual DCT-I and DST-I transforms, unlike the
libavcodec versions, which are plainly not.

Error tests via https://github.com/cyanreg/lavu_fft_test

RMS error on a 2048-sample DCT-I:
RMSE   av_tx = 0.000000 (4096 matches, first mismatch at -1)
RMSE  fftw3f = 0.000000 (4096 matches, first mismatch at -1)
RMSE   avfft = 0.011440 (0 matches, first mismatch at 0)

RMS error on a 2048-sample DST-I:
RMSE   av_tx = 0.000000 (4096 matches, first mismatch at -1)
RMSE  fftw3f = 0.000000 (4096 matches, first mismatch at -1)
RMSE   avfft = 0.015316 (0 matches, first mismatch at 0)


[-- Attachment #2: 0002-lavu-tx-add-DCT-I-and-DST-I-transforms.patch --]
[-- Type: text/x-diff, Size: 5093 bytes --]

From 0bbe264a0c597a5a871ffc2bfea06e717bc9e0a1 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Thu, 3 Aug 2023 18:23:02 +0200
Subject: [PATCH 2/2] lavu/tx: add DCT-I and DST-I transforms

These are true, actual DCT-I and DST-I transforms, unlike the
libavcodec versions, which are plainly not.
---
 libavutil/tx.h          |  24 ++++++++++
 libavutil/tx_template.c | 103 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+)

diff --git a/libavutil/tx.h b/libavutil/tx.h
index d178e8ee9d..4696988cae 100644
--- a/libavutil/tx.h
+++ b/libavutil/tx.h
@@ -105,6 +105,30 @@ enum AVTXType {
     AV_TX_DOUBLE_DCT = 10,
     AV_TX_INT32_DCT  = 11,
 
+    /**
+     * Discrete Cosine Transform I
+     *
+     * The forward transform is a DCT-I.
+     * The inverse transform is a DCT-I multiplied by 2/(N + 1).
+     *
+     * The input array is always overwritten.
+     */
+    AV_TX_FLOAT_DCT_I  = 12,
+    AV_TX_DOUBLE_DCT_I = 13,
+    AV_TX_INT32_DCT_I  = 14,
+
+    /**
+     * Discrete Sine Transform I
+     *
+     * The forward transform is a DST-I.
+     * The inverse transform is a DST-I multiplied by 2/(N + 1).
+     *
+     * The input array is always overwritten.
+     */
+    AV_TX_FLOAT_DST_I  = 15,
+    AV_TX_DOUBLE_DST_I = 16,
+    AV_TX_INT32_DST_I  = 17,
+
     /* Not part of the API, do not use */
     AV_TX_NB,
 };
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index 50c65d00b5..9bdac1e57d 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -2004,6 +2004,107 @@ static const FFTXCodelet TX_NAME(ff_tx_dctIII_def) = {
     .prio       = FF_TX_PRIO_BASE,
 };
 
+static av_cold int TX_NAME(ff_tx_dcstI_init)(AVTXContext *s,
+                                             const FFTXCodelet *cd,
+                                             uint64_t flags,
+                                             FFTXCodeletOptions *opts,
+                                             int len, int inv,
+                                             const void *scale)
+{
+    int ret;
+    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
+
+    if (0 && inv) { /* NOTE(review): constant-false condition, this block never runs */
+        len *= 2;
+        s->len *= 2;
+        rsc *= 0.5;
+    }
+
+    /* We want a half-complex RDFT */
+    flags |= cd->type == TX_TYPE(DCT_I) ? AV_TX_REAL_TO_REAL :
+                                          AV_TX_REAL_TO_IMAGINARY;
+
+    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL,
+                                (len - 1 + 2*(cd->type == TX_TYPE(DST_I)))*2, /* DST-I: 2*(len + 1), otherwise 2*(len - 1) */
+                                0, &rsc))) /* sub-transform is always requested with inv = 0 */
+        return ret;
+
+    s->tmp = av_mallocz((len + 1)*2*sizeof(TXSample)); /* scratch buffer filled by ff_tx_dctI/ff_tx_dstI */
+    if (!s->tmp)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static void TX_NAME(ff_tx_dctI)(AVTXContext *s, void *_dst,
+                                void *_src, ptrdiff_t stride)
+{
+    const int       last   = s->len - 1;
+    const ptrdiff_t step   = stride / sizeof(TXSample);
+    const TXSample *in     = _src;
+    TXSample       *mirror = (TXSample *)s->tmp;
+
+    /* Mirror the input: sample k lands at index k and at index 2*last - k */
+    for (int k = 0; k < last; k++) {
+        mirror[2*last - k] = in[k * step];
+        mirror[k]          = mirror[2*last - k];
+    }
+    mirror[last] = in[last * step]; /* centre sample, stored once */
+
+    s->fn[0](&s->sub[0], _dst, mirror, sizeof(TXSample));
+}
+
+static void TX_NAME(ff_tx_dstI)(AVTXContext *s, void *_dst,
+                                void *_src, ptrdiff_t stride)
+{
+    TXSample *dst = _dst;
+    TXSample *src = _src;
+    const int len = s->len + 1;
+    TXSample *tmp = (void *)s->tmp;
+
+    stride /= sizeof(TXSample); /* stride is used below as an index in TXSample units */
+
+    tmp[0] = 0;
+
+    for (int i = 1; i < len; i++) { /* tmp[i] = -x, tmp[2*len - i] = +x */
+        TXSample a = src[(i - 1) * stride];
+        tmp[i] = -a;
+        tmp[2*len - i] = a;
+    }
+
+    tmp[len] = 0; /* i == n, Nyquist */
+
+    s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample)); /* TXSample, not float: this code is templated */
+}
+
+static const FFTXCodelet TX_NAME(ff_tx_dctI_def) = {
+    .name       = TX_NAME_STR("dctI"),
+    .function   = TX_NAME(ff_tx_dctI),
+    .type       = TX_TYPE(DCT_I),
+    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, /* no FF_TX_*_ONLY direction flag set */
+    .factors    = { 2, TX_FACTOR_ANY },
+    .nb_factors = 2,
+    .min_len    = 2,
+    .max_len    = TX_LEN_UNLIMITED,
+    .init       = TX_NAME(ff_tx_dcstI_init), /* same init function as the DST-I codelet */
+    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
+    .prio       = FF_TX_PRIO_BASE,
+};
+
+static const FFTXCodelet TX_NAME(ff_tx_dstI_def) = {
+    .name       = TX_NAME_STR("dstI"),
+    .function   = TX_NAME(ff_tx_dstI),
+    .type       = TX_TYPE(DST_I),
+    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, /* no FF_TX_*_ONLY direction flag set */
+    .factors    = { 2, TX_FACTOR_ANY },
+    .nb_factors = 2,
+    .min_len    = 2,
+    .max_len    = TX_LEN_UNLIMITED,
+    .init       = TX_NAME(ff_tx_dcstI_init), /* same init function as the DCT-I codelet */
+    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
+    .prio       = FF_TX_PRIO_BASE,
+};
+
 int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
 {
     int off = 0;
@@ -2101,6 +2202,8 @@ const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
     &TX_NAME(ff_tx_rdft_c2r_def),
     &TX_NAME(ff_tx_dctII_def),
     &TX_NAME(ff_tx_dctIII_def),
+    &TX_NAME(ff_tx_dctI_def),
+    &TX_NAME(ff_tx_dstI_def),
 
     NULL,
 };
-- 
2.40.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT transforms
       [not found] ` <NawD9Lq--3-9@lynne.ee-NawDDOJ----9>
  2023-08-03 16:31   ` [FFmpeg-devel] [PATCH 2/2] lavu/tx: add DCT-I and DST-I transforms Lynne
@ 2023-08-03 16:42   ` Lynne
       [not found]   ` <NawGxLe--3-9@lynne.ee-NawH0-d----9>
  2 siblings, 0 replies; 7+ messages in thread
From: Lynne @ 2023-08-03 16:42 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Aug 3, 2023, 18:26 by dev@lynne.ee:

> These are in-place transforms, required for DCT-I and DST-I.
>
> Templated as the mod2 variant requires minor modifications, and is
> required specifically for DCT-I/DST-I.
>
> Quite optimized, as there's no need for any additional buffer storage.
>

Specifically, for R2R, vs fftw
  26280 decicycles in           av_tx (r2c), 1048574 runs,      2 skips
  69940 decicycles in         fftwf_execute, 1048576 runs,      0 skips

And for R2I
  25856 decicycles in           av_tx (r2c), 1048571 runs,      5 skips
  65561 decicycles in         fftwf_execute, 1048576 runs,      0 skips
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT transforms
  2023-08-03 16:26 [FFmpeg-devel] [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT transforms Lynne
       [not found] ` <NawD9Lq--3-9@lynne.ee-NawDDOJ----9>
@ 2023-08-03 20:32 ` Michael Niedermayer
  2023-08-03 20:39   ` Andreas Rheinhardt
  1 sibling, 1 reply; 7+ messages in thread
From: Michael Niedermayer @ 2023-08-03 20:32 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 2585 bytes --]

On Thu, Aug 03, 2023 at 06:26:16PM +0200, Lynne wrote:
> These are in-place transforms, required for DCT-I and DST-I.
> 
> Templated as the mod2 variant requires minor modifications, and is
> required specifically for DCT-I/DST-I.
> 
> Quite optimized, as there's no need for any additional buffer storage.
> 

>  doc/APIchanges          |    3 
>  libavutil/tx.c          |   18 ++++
>  libavutil/tx.h          |   10 ++
>  libavutil/tx_template.c |  175 +++++++++++++++++++++++++++++++++++++-----------
>  libavutil/version.h     |    2 
>  5 files changed, 167 insertions(+), 41 deletions(-)
> 6e6308365cd78a84c7db5800207e6b5977945079  0001-lavu-tx-add-real-to-real-and-real-to-imaginary-RDFT-.patch
> From 2ea5e2541c2551bf1b56e967d35946289a85aa49 Mon Sep 17 00:00:00 2001
> From: Lynne <dev@lynne.ee>
> Date: Thu, 3 Aug 2023 18:21:23 +0200
> Subject: [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT
>  transforms
> 
> These are in-place transforms, required for DCT-I and DST-I.
> 
> Templated as the mod2 variant requires minor modifications, and is
> required specifically for DCT-I/DST-I.

not sure if i forgot some patch but
this seems to break fate-binkaudio-dct

make V=2 fate-binkaudio-dct -j32
...
[bink @ 0x559fc8d8dcc0] Failed to open codec in avformat_find_stream_info
[bink @ 0x559fc8d8dcc0] Packet corrupt (stream = 0, dts = NOPTS).
[bink @ 0x559fc8d8dcc0] Failed to open codec in avformat_find_stream_info
[bink @ 0x559fc8d8dcc0] Could not find codec parameters for stream 1 (Audio: binkaudio_dct, 44000 Hz, 0 channels, fltp): unspecified number of channels
Consider increasing the value for the 'analyzeduration' (0) and 'probesize' (5000000) options
Input #0, bink, from 'fate/fate-suite//bink/binkaudio_dct.bik':
  Duration: 00:00:08.87, start: 0.000000, bitrate: 183 kb/s
  Stream #0:0[0x0]: Video: binkvideo (BIKi / 0x694B4942), yuv420p(tv), 640x480, 30 fps, 30 tbr, 30 tbn
  Stream #0:1[0x0]: Audio: binkaudio_dct, 44000 Hz, stereo, fltp
[aist#0:1/binkaudio_dct @ 0x559fc8d93080] Error while opening decoder: Function not implemented
[aost#0:0/pcm_s16le @ 0x559fc8de5c40] Error initializing a simple filtergraph
Error opening output file -.
Error opening output files: Function not implemented
threads=1
tests/Makefile:308: recipe for target 'fate-binkaudio-dct' failed
make: *** [fate-binkaudio-dct] Error 218



[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Nations do behave wisely once they have exhausted all other alternatives. 
-- Abba Eban

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT transforms
  2023-08-03 20:32 ` [FFmpeg-devel] [PATCH " Michael Niedermayer
@ 2023-08-03 20:39   ` Andreas Rheinhardt
  2023-08-04  2:04     ` Lynne
  0 siblings, 1 reply; 7+ messages in thread
From: Andreas Rheinhardt @ 2023-08-03 20:39 UTC (permalink / raw)
  To: ffmpeg-devel

Michael Niedermayer:
> On Thu, Aug 03, 2023 at 06:26:16PM +0200, Lynne wrote:
>> These are in-place transforms, required for DCT-I and DST-I.
>>
>> Templated as the mod2 variant requires minor modifications, and is
>> required specifically for DCT-I/DST-I.
>>
>> Quite optimized, as there's no need for any additional buffer storage.
>>
> 
>>  doc/APIchanges          |    3 
>>  libavutil/tx.c          |   18 ++++
>>  libavutil/tx.h          |   10 ++
>>  libavutil/tx_template.c |  175 +++++++++++++++++++++++++++++++++++++-----------
>>  libavutil/version.h     |    2 
>>  5 files changed, 167 insertions(+), 41 deletions(-)
>> 6e6308365cd78a84c7db5800207e6b5977945079  0001-lavu-tx-add-real-to-real-and-real-to-imaginary-RDFT-.patch
>> From 2ea5e2541c2551bf1b56e967d35946289a85aa49 Mon Sep 17 00:00:00 2001
>> From: Lynne <dev@lynne.ee>
>> Date: Thu, 3 Aug 2023 18:21:23 +0200
>> Subject: [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT
>>  transforms
>>
>> These are in-place transforms, required for DCT-I and DST-I.
>>
>> Templated as the mod2 variant requires minor modifications, and is
>> required specifically for DCT-I/DST-I.
> 
> not sure if i forgot some patch but
> this seems to break fate-binkaudio-dct
> 

It's not just you; patchwork reported four failing tests:
make: *** [fate-filter-firequalizer] Error 218
make: *** [fate-binkaudio-rdft] Error 218
make: *** [fate-binkaudio-dct] Error 218
make: *** [fate-qdm2] Error 218

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT transforms
  2023-08-03 20:39   ` Andreas Rheinhardt
@ 2023-08-04  2:04     ` Lynne
  0 siblings, 0 replies; 7+ messages in thread
From: Lynne @ 2023-08-04  2:04 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Aug 3, 2023, 22:39 by andreas.rheinhardt@outlook.com:

> Michael Niedermayer:
>
>> On Thu, Aug 03, 2023 at 06:26:16PM +0200, Lynne wrote:
>>
>>> These are in-place transforms, required for DCT-I and DST-I.
>>>
>>> Templated as the mod2 variant requires minor modifications, and is
>>> required specifically for DCT-I/DST-I.
>>>
>>> Quite optimized, as there's no need for any additional buffer storage.
>>>
>>> doc/APIchanges          |    3 
>>>  libavutil/tx.c          |   18 ++++
>>>  libavutil/tx.h          |   10 ++
>>>  libavutil/tx_template.c |  175 +++++++++++++++++++++++++++++++++++++-----------
>>>  libavutil/version.h     |    2 
>>>  5 files changed, 167 insertions(+), 41 deletions(-)
>>> 6e6308365cd78a84c7db5800207e6b5977945079  0001-lavu-tx-add-real-to-real-and-real-to-imaginary-RDFT-.patch
>>> From 2ea5e2541c2551bf1b56e967d35946289a85aa49 Mon Sep 17 00:00:00 2001
>>> From: Lynne <dev@lynne.ee>
>>> Date: Thu, 3 Aug 2023 18:21:23 +0200
>>> Subject: [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT
>>>  transforms
>>>
>>> These are in-place transforms, required for DCT-I and DST-I.
>>>
>>> Templated as the mod2 variant requires minor modifications, and is
>>> required specifically for DCT-I/DST-I.
>>>
>>
>> not sure if i forgot some patch but
>> this seems to break fate-binkaudio-dct
>>
>
> It's not just you; patchwork reported four failing tests:
> make: *** [fate-filter-firequalizer] Error 218
> make: *** [fate-binkaudio-rdft] Error 218
> make: *** [fate-binkaudio-dct] Error 218
> make: *** [fate-qdm2] Error 218
>
> - Andreas
>

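Thanks, I had a typo in the codelet definition for the standard, non-half RDFTs:
the ternary was missing its parentheses, and since `|` binds tighter than `?:`,
the whole flags expression ended up as the ternary's condition. One line change: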

@@ -1711,7 +1703,7 @@ static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
     .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
     .type       = TX_TYPE(RDFT),                                               \
     .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |       \
-                  inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY,               \
+                  (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY),             \
     .factors    = { 4, TX_FACTOR_ANY },                                        \
     .nb_factors = 2,                                                           \
     .min_len    = 4,                                                           \

V2 sent.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH v2 1/2] lavu/tx: add real to real and real to imaginary RDFT transforms
       [not found]   ` <NawGxLe--3-9@lynne.ee-NawH0-d----9>
@ 2023-08-04  2:05     ` Lynne
  0 siblings, 0 replies; 7+ messages in thread
From: Lynne @ 2023-08-04  2:05 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 169 bytes --]

These are in-place transforms, required for DCT-I and DST-I.

Templated as the mod2 variant requires minor modifications, and is
required specifically for DCT-I/DST-I.


[-- Attachment #2: v2-0001-lavu-tx-add-real-to-real-and-real-to-imaginary-RD.patch --]
[-- Type: text/x-diff, Size: 18590 bytes --]

From c12c72e9de37a9eedf83e8ceb5ee444575420237 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Thu, 3 Aug 2023 18:21:23 +0200
Subject: [PATCH v2 1/2] lavu/tx: add real to real and real to imaginary RDFT
 transforms

These are in-place transforms, required for DCT-I and DST-I.

Templated as the mod2 variant requires minor modifications, and is
required specifically for DCT-I/DST-I.
---
 doc/APIchanges          |   3 +
 libavutil/tx.c          |  18 ++++-
 libavutil/tx.h          |  10 +++
 libavutil/tx_template.c | 175 +++++++++++++++++++++++++++++++---------
 libavutil/version.h     |   2 +-
 5 files changed, 167 insertions(+), 41 deletions(-)

diff --git a/doc/APIchanges b/doc/APIchanges
index 5afe8bcb75..edd178be4f 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -2,6 +2,9 @@ The last version increases of all libraries were on 2023-02-09
 
 API changes, most recent first:
 
+2023-07-xx - xxxxxxxxxx - lavu 58.15.100 - tx.h
+  Add AV_TX_REAL_TO_REAL and AV_TX_REAL_TO_IMAGINARY
+
 2023-07-xx - xxxxxxxxxx - lavc 60 - avcodec.h
   Deprecate AV_CODEC_FLAG_DROPCHANGED without replacement.
 
diff --git a/libavutil/tx.c b/libavutil/tx.c
index e25abf998f..e9826e6107 100644
--- a/libavutil/tx.c
+++ b/libavutil/tx.c
@@ -437,7 +437,9 @@ int ff_tx_decompose_length(int dst[TX_MAX_DECOMPOSITIONS], enum AVTXType type,
 
             /* Check direction for non-orthogonal codelets */
             if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) ||
-                ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv))
+                ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv) ||
+                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_REAL)) && inv) ||
+                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_IMAGINARY)) && inv))
                 continue;
 
             /* Check if the CPU supports the required ISA */
@@ -560,6 +562,10 @@ static void print_flags(AVBPrint *bp, uint64_t f)
         av_bprintf(bp, "%spreshuf", prev > 1 ? sep : "");
     if ((f & AV_TX_FULL_IMDCT) && ++prev)
         av_bprintf(bp, "%simdct_full", prev > 1 ? sep : "");
+    if ((f & AV_TX_REAL_TO_REAL) && ++prev)
+        av_bprintf(bp, "%sreal_to_real", prev > 1 ? sep : "");
+    if ((f & AV_TX_REAL_TO_IMAGINARY) && ++prev)
+        av_bprintf(bp, "%sreal_to_imaginary", prev > 1 ? sep : "");
     if ((f & FF_TX_ASM_CALL) && ++prev)
         av_bprintf(bp, "%sasm_call", prev > 1 ? sep : "");
     av_bprintf(bp, "]");
@@ -717,7 +723,11 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
     uint64_t req_flags = flags;
 
     /* Flags the codelet may require to be present */
-    uint64_t inv_req_mask = AV_TX_FULL_IMDCT | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL;
+    uint64_t inv_req_mask = AV_TX_FULL_IMDCT |
+                            AV_TX_REAL_TO_REAL |
+                            AV_TX_REAL_TO_IMAGINARY |
+                            FF_TX_PRESHUFFLE |
+                            FF_TX_ASM_CALL;
 
     /* Unaligned codelets are compatible with the aligned flag */
     if (req_flags & FF_TX_ALIGNED)
@@ -742,7 +752,9 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
 
             /* Check direction for non-orthogonal codelets */
             if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) ||
-                ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv))
+                ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv) ||
+                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_REAL)) && inv) ||
+                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_IMAGINARY)) && inv))
                 continue;
 
             /* Check if the requested flags match from both sides */
diff --git a/libavutil/tx.h b/libavutil/tx.h
index 064edbc097..d178e8ee9d 100644
--- a/libavutil/tx.h
+++ b/libavutil/tx.h
@@ -149,6 +149,16 @@ enum AVTXFlags {
      * Ignored for all transforms but inverse MDCTs.
      */
     AV_TX_FULL_IMDCT = 1ULL << 2,
+
+    /**
+     * Perform a real to half-complex RDFT.
+     * Only the real or only the imaginary coefficients will
+     * be output, depending on the flag used. Only available for forward RDFTs.
+     * Output array must have enough space to hold N complex values
+     * (regular size for a real to complex transform).
+     */
+    AV_TX_REAL_TO_REAL      = 1ULL << 3,
+    AV_TX_REAL_TO_IMAGINARY = 1ULL << 4,
 };
 
 /**
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index c4ec9502e0..c56dcf0826 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -1613,14 +1613,17 @@ static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s,
     int ret;
     double f, m;
     TXSample *tab;
+    int len4 = FFALIGN(len, 4) / 4;
 
     s->scale_d = *((SCALE_TYPE *)scale);
     s->scale_f = s->scale_d;
 
+    flags &= ~(AV_TX_REAL_TO_REAL | AV_TX_REAL_TO_IMAGINARY);
+
     if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
         return ret;
 
-    if (!(s->exp = av_mallocz((8 + (len >> 2) - 1)*sizeof(*s->exp))))
+    if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))
         return AVERROR(ENOMEM);
 
     tab = (TXSample *)s->exp;
@@ -1639,17 +1642,20 @@ static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s,
     *tab++ = RESCALE( (0.5 - inv) * m);
     *tab++ = RESCALE(-(0.5 - inv) * m);
 
-    for (int i = 0; i < len >> 2; i++)
+    for (int i = 0; i < len4; i++)
         *tab++ = RESCALE(cos(i*f));
-    for (int i = len >> 2; i >= 0; i--)
-        *tab++ = RESCALE(cos(i*f) * (inv ? +1.0 : -1.0));
+
+    tab = ((TXSample *)s->exp) + len4 + 8;
+
+    for (int i = 0; i < len4; i++)
+        *tab++ = RESCALE(cos(((float)len/4.0 - (float)i + 0)*f) * (inv ? +1.0 : -1.0));
 
     return 0;
 }
 
-#define DECL_RDFT(name, inv)                                                   \
-static void TX_NAME(ff_tx_rdft_ ##name)(AVTXContext *s, void *_dst,            \
-                                       void *_src, ptrdiff_t stride)           \
+#define DECL_RDFT(n, inv)                                                      \
+static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
+                                     void *_src, ptrdiff_t stride)             \
 {                                                                              \
     const int len2 = s->len >> 1;                                              \
     const int len4 = s->len >> 2;                                              \
@@ -1698,40 +1704,131 @@ static void TX_NAME(ff_tx_rdft_ ##name)(AVTXContext *s, void *_dst,            \
         data[len2].re = data[0].im;                                            \
         data[   0].im = data[len2].im = 0;                                     \
     }                                                                          \
-}
+}                                                                              \
+                                                                               \
+static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
+    .name       = TX_NAME_STR("rdft_" #n),                                     \
+    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
+    .type       = TX_TYPE(RDFT),                                               \
+    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |       \
+                  (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY),             \
+    .factors    = { 4, TX_FACTOR_ANY },                                        \
+    .nb_factors = 2,                                                           \
+    .min_len    = 4,                                                           \
+    .max_len    = TX_LEN_UNLIMITED,                                            \
+    .init       = TX_NAME(ff_tx_rdft_init),                                    \
+    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
+    .prio       = FF_TX_PRIO_BASE,                                             \
+};
 
-DECL_RDFT(r2c, 0)
-DECL_RDFT(c2r, 1)
+DECL_RDFT(r2c,  0)
+DECL_RDFT(c2r,  1)
 
-static const FFTXCodelet TX_NAME(ff_tx_rdft_r2c_def) = {
-    .name       = TX_NAME_STR("rdft_r2c"),
-    .function   = TX_NAME(ff_tx_rdft_r2c),
-    .type       = TX_TYPE(RDFT),
-    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
-                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
-    .factors    = { 2, TX_FACTOR_ANY },
-    .nb_factors = 2,
-    .min_len    = 2,
-    .max_len    = TX_LEN_UNLIMITED,
-    .init       = TX_NAME(ff_tx_rdft_init),
-    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
-    .prio       = FF_TX_PRIO_BASE,
+#define DECL_RDFT_HALF(n, mode, mod2)                                          \
+static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
+                                        void *_src, ptrdiff_t stride)          \
+{                                                                              \
+    const int len = s->len;                                                    \
+    const int len2 = len >> 1;                                                 \
+    const int len4 = len >> 2;                                                 \
+    const int aligned_len4 = FFALIGN(len, 4)/4;                                \
+    const TXSample *fact = (void *)s->exp;                                     \
+    const TXSample *tcos = fact + 8;                                           \
+    const TXSample *tsin = tcos + aligned_len4;                                \
+    TXComplex *data = _dst;                                                    \
+    TXSample *out = _dst; /* Half-complex is forward-only */                   \
+    TXSample tmp_dc;                                                           \
+    av_unused TXSample tmp_mid;                                                \
+    TXSample tmp[4];                                                           \
+    TXComplex sf, sl;                                                          \
+                                                                               \
+    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex));                       \
+                                                                               \
+    tmp_dc = data[0].re;                                                       \
+    data[   0].re = tmp_dc + data[0].im;                                       \
+    tmp_dc        = tmp_dc - data[0].im;                                       \
+                                                                               \
+    data[   0].re = MULT(fact[0], data[   0].re);                              \
+    tmp_dc        = MULT(fact[1],        tmp_dc);                              \
+    data[len4].re = MULT(fact[2], data[len4].re);                              \
+                                                                               \
+    if (!mod2) {                                                               \
+        data[len4].im = MULT(fact[3], data[len4].im);                          \
+    } else {                                                                   \
+        sf = data[len4];                                                       \
+        sl = data[len4 + 1];                                                   \
+        if (mode == AV_TX_REAL_TO_REAL)                                        \
+            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
+        else                                                                   \
+            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
+        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
+        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
+                                                                               \
+        if (mode == AV_TX_REAL_TO_REAL) {                                      \
+            tmp[3]  = tmp[1]*tcos[len4] - tmp[2]*tsin[len4];                   \
+            tmp_mid = (tmp[0] - tmp[3]);                                       \
+        } else {                                                               \
+            tmp[3]  = tmp[1]*tsin[len4] + tmp[2]*tcos[len4];                   \
+            tmp_mid = (tmp[0] + tmp[3]);                                       \
+        }                                                                      \
+    }                                                                          \
+                                                                               \
+    /* NOTE: unrolling this breaks non-mod8 lengths */                         \
+    for (int i = 1; i <= len4; i++) {                                          \
+        TXSample tmp[4];                                                       \
+        TXComplex sf = data[i];                                                \
+        TXComplex sl = data[len2 - i];                                         \
+                                                                               \
+        if (mode == AV_TX_REAL_TO_REAL)                                        \
+            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
+        else                                                                   \
+            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
+                                                                               \
+        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
+        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
+                                                                               \
+        if (mode == AV_TX_REAL_TO_REAL) {                                      \
+            tmp[3]           = tmp[1]*tcos[i] - tmp[2]*tsin[i];                \
+            out[i]           = (tmp[0] + tmp[3]);                              \
+            out[len - i]     = (tmp[0] - tmp[3]);                              \
+        } else {                                                               \
+            tmp[3]           = tmp[1]*tsin[i] + tmp[2]*tcos[i];                \
+            out[i - 1]       = (tmp[3] - tmp[0]);                              \
+            out[len - i - 1] = (tmp[0] + tmp[3]);                              \
+        }                                                                      \
+    }                                                                          \
+                                                                               \
+    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++)       \
+        out[len2 - i] = out[len - i];                                          \
+                                                                               \
+    if (mode == AV_TX_REAL_TO_REAL) {                                          \
+        out[len2] = tmp_dc;                                                    \
+        if (mod2)                                                              \
+            out[len4 + 1] = tmp_mid;                                           \
+    } else if (mod2) {                                                         \
+        out[len4] = tmp_mid;                                                   \
+    }                                                                          \
+}                                                                              \
+                                                                               \
+static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
+    .name       = TX_NAME_STR("rdft_" #n),                                     \
+    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
+    .type       = TX_TYPE(RDFT),                                               \
+    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | mode |                     \
+                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,                     \
+    .factors    = { 2 + 2*(!mod2), TX_FACTOR_ANY },                            \
+    .nb_factors = 2,                                                           \
+    .min_len    = 2 + 2*(!mod2),                                               \
+    .max_len    = TX_LEN_UNLIMITED,                                            \
+    .init       = TX_NAME(ff_tx_rdft_init),                                    \
+    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
+    .prio       = FF_TX_PRIO_BASE,                                             \
 };
 
-static const FFTXCodelet TX_NAME(ff_tx_rdft_c2r_def) = {
-    .name       = TX_NAME_STR("rdft_c2r"),
-    .function   = TX_NAME(ff_tx_rdft_c2r),
-    .type       = TX_TYPE(RDFT),
-    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
-                  FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
-    .factors    = { 2, TX_FACTOR_ANY },
-    .nb_factors = 2,
-    .min_len    = 2,
-    .max_len    = TX_LEN_UNLIMITED,
-    .init       = TX_NAME(ff_tx_rdft_init),
-    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
-    .prio       = FF_TX_PRIO_BASE,
-};
+DECL_RDFT_HALF(r2r,      AV_TX_REAL_TO_REAL,      0)
+DECL_RDFT_HALF(r2r_mod2, AV_TX_REAL_TO_REAL,      1)
+DECL_RDFT_HALF(r2i,      AV_TX_REAL_TO_IMAGINARY, 0)
+DECL_RDFT_HALF(r2i_mod2, AV_TX_REAL_TO_IMAGINARY, 1)
 
 static av_cold int TX_NAME(ff_tx_dct_init)(AVTXContext *s,
                                            const FFTXCodelet *cd,
@@ -1997,6 +2094,10 @@ const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
     &TX_NAME(ff_tx_mdct_naive_inv_def),
     &TX_NAME(ff_tx_mdct_inv_full_def),
     &TX_NAME(ff_tx_rdft_r2c_def),
+    &TX_NAME(ff_tx_rdft_r2r_def),
+    &TX_NAME(ff_tx_rdft_r2r_mod2_def),
+    &TX_NAME(ff_tx_rdft_r2i_def),
+    &TX_NAME(ff_tx_rdft_r2i_mod2_def),
     &TX_NAME(ff_tx_rdft_c2r_def),
     &TX_NAME(ff_tx_dctII_def),
     &TX_NAME(ff_tx_dctIII_def),
diff --git a/libavutil/version.h b/libavutil/version.h
index 24af520e08..9e798b0e3f 100644
--- a/libavutil/version.h
+++ b/libavutil/version.h
@@ -79,7 +79,7 @@
  */
 
 #define LIBAVUTIL_VERSION_MAJOR  58
-#define LIBAVUTIL_VERSION_MINOR  14
+#define LIBAVUTIL_VERSION_MINOR  15
 #define LIBAVUTIL_VERSION_MICRO 100
 
 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
-- 
2.40.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2023-08-04  2:05 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-03 16:26 [FFmpeg-devel] [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT transforms Lynne
     [not found] ` <NawD9Lq--3-9@lynne.ee-NawDDOJ----9>
2023-08-03 16:31   ` [FFmpeg-devel] [PATCH 2/2] lavu/tx: add DCT-I and DST-I transforms Lynne
2023-08-03 16:42   ` [FFmpeg-devel] [PATCH 1/2] lavu/tx: add real to real and real to imaginary RDFT transforms Lynne
     [not found]   ` <NawGxLe--3-9@lynne.ee-NawH0-d----9>
2023-08-04  2:05     ` [FFmpeg-devel] [PATCH v2 " Lynne
2023-08-03 20:32 ` [FFmpeg-devel] [PATCH " Michael Niedermayer
2023-08-03 20:39   ` Andreas Rheinhardt
2023-08-04  2:04     ` Lynne

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git