[FFmpeg-devel] [PATCH v3 0/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH v3 0/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64
@ 2024-04-17 18:01 Ramiro Polla
  2024-04-17 18:01 ` [FFmpeg-devel] [PATCH v3 1/2] checkasm: add test for fdct Ramiro Polla
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Ramiro Polla @ 2024-04-17 18:01 UTC (permalink / raw)
  To: ffmpeg-devel

This patch set adds fdct to checkasm and neon-optimized fdct for aarch64.

Ramiro Polla (2):
  checkasm: add test for fdct
  lavc/aarch64/fdct: add neon-optimized fdct for aarch64

 libavcodec/aarch64/Makefile               |   2 +
 libavcodec/aarch64/fdct.h                 |  26 ++
 libavcodec/aarch64/fdctdsp_init_aarch64.c |  39 +++
 libavcodec/aarch64/fdctdsp_neon.S         | 368 ++++++++++++++++++++++
 libavcodec/avcodec.h                      |   1 +
 libavcodec/fdctdsp.c                      |   4 +-
 libavcodec/fdctdsp.h                      |   2 +
 libavcodec/options_table.h                |   1 +
 libavcodec/tests/aarch64/dct.c            |   2 +
 tests/checkasm/Makefile                   |   1 +
 tests/checkasm/checkasm.c                 |   3 +
 tests/checkasm/checkasm.h                 |   1 +
 tests/checkasm/fdctdsp.c                  |  68 ++++
 tests/fate/checkasm.mak                   |   1 +
 14 files changed, 518 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/fdct.h
 create mode 100644 libavcodec/aarch64/fdctdsp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/fdctdsp_neon.S
 create mode 100644 tests/checkasm/fdctdsp.c

-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v3 1/2] checkasm: add test for fdct
  2024-04-17 18:01 [FFmpeg-devel] [PATCH v3 0/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64 Ramiro Polla
@ 2024-04-17 18:01 ` Ramiro Polla
  2024-05-02 18:04   ` Rémi Denis-Courmont
  2024-04-17 18:01 ` [FFmpeg-devel] [PATCH v3 2/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64 Ramiro Polla
  2024-04-17 20:49 ` [FFmpeg-devel] [PATCH v3 0/2] " Martin Storsjö
  2 siblings, 1 reply; 10+ messages in thread
From: Ramiro Polla @ 2024-04-17 18:01 UTC (permalink / raw)
  To: ffmpeg-devel

Reviewed-by: Martin Storsjö <martin@martin.st>
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/fdctdsp.c  | 68 +++++++++++++++++++++++++++++++++++++++
 tests/fate/checkasm.mak   |  1 +
 5 files changed, 74 insertions(+)
 create mode 100644 tests/checkasm/fdctdsp.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 2673e1d098..70a6120c70 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -4,6 +4,7 @@ AVCODECOBJS-$(CONFIG_AC3DSP)            += ac3dsp.o
 AVCODECOBJS-$(CONFIG_AUDIODSP)          += audiodsp.o
 AVCODECOBJS-$(CONFIG_BLOCKDSP)          += blockdsp.o
 AVCODECOBJS-$(CONFIG_BSWAPDSP)          += bswapdsp.o
+AVCODECOBJS-$(CONFIG_FDCTDSP)           += fdctdsp.o
 AVCODECOBJS-$(CONFIG_FMTCONVERT)        += fmtconvert.o
 AVCODECOBJS-$(CONFIG_G722DSP)           += g722dsp.o
 AVCODECOBJS-$(CONFIG_H264CHROMA)        += h264chroma.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 8be6cb0f55..92c3a30ad3 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -106,6 +106,9 @@ static const struct {
     #if CONFIG_EXR_DECODER
         { "exrdsp", checkasm_check_exrdsp },
     #endif
+    #if CONFIG_FDCTDSP
+        { "fdctdsp", checkasm_check_fdctdsp },
+    #endif
     #if CONFIG_FLAC_DECODER
         { "flacdsp", checkasm_check_flacdsp },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index f90920dee7..d3e8f9a37a 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -85,6 +85,7 @@ void checkasm_check_blockdsp(void);
 void checkasm_check_bswapdsp(void);
 void checkasm_check_colorspace(void);
 void checkasm_check_exrdsp(void);
+void checkasm_check_fdctdsp(void);
 void checkasm_check_fixed_dsp(void);
 void checkasm_check_flacdsp(void);
 void checkasm_check_float_dsp(void);
diff --git a/tests/checkasm/fdctdsp.c b/tests/checkasm/fdctdsp.c
new file mode 100644
index 0000000000..68a9b5e435
--- /dev/null
+++ b/tests/checkasm/fdctdsp.c
@@ -0,0 +1,68 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fdctdsp.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/mem_internal.h"
+
+static int int16_cmp_off_by_n(const int16_t *ref, const int16_t *test, size_t n, int accuracy)
+{
+    for (size_t i = 0; i < n; i++) {
+        if (abs(ref[i] - test[i]) > accuracy)   /* int16_t operands are promoted to int before subtracting */
+            return 1;                           /* mismatch: element i differs by more than accuracy */
+    }
+    return 0;                                   /* all n elements are within accuracy of ref */
+}
+
+static void check_fdct(void)
+{
+    LOCAL_ALIGNED_16(int16_t, block0, [64]);    /* block passed to call_ref() */
+    LOCAL_ALIGNED_16(int16_t, block1, [64]);    /* block passed to call_new() and bench_new() */
+
+    AVCodecContext avctx = { 0 };               /* every field zero, so dct_algo == 0 */
+    FDCTDSPContext h;
+
+    ff_fdctdsp_init(&h, &avctx);                /* sets h.fdct; the arch-specific init may replace it */
+    /* Run the reference and the new fdct on identical random input and compare both outputs. */
+    if (check_func(h.fdct, "fdct")) {
+        declare_func(void, int16_t *);
+        for (int i = 0; i < 64; i++) {
+            uint8_t r = rnd();                  /* keep only the low 8 bits: inputs are limited to 0..255 */
+            block0[i] = r;
+            block1[i] = r;
+        }
+        call_ref(block0);
+        call_new(block1);
+        if (int16_cmp_off_by_n(block0, block1, 64, 2))  /* each coefficient may deviate by at most 2 */
+            fail();
+        bench_new(block1);                      /* NOTE(review): block1 was already passed to call_new(); presumably it holds output, not the random input - confirm */
+    }
+}
+
+void checkasm_check_fdctdsp(void)   /* entry point listed as "fdctdsp" in the checkasm.c test table */
+{
+    check_fdct();
+    report("fdctdsp");              /* presumably emits the checkasm summary line for this group - confirm in checkasm.h */
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index 3b5b867a97..10a42f2f9d 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -8,6 +8,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                                 \
                 fate-checkasm-blockdsp                                  \
                 fate-checkasm-bswapdsp                                  \
                 fate-checkasm-exrdsp                                    \
+                fate-checkasm-fdctdsp                                   \
                 fate-checkasm-fixed_dsp                                 \
                 fate-checkasm-flacdsp                                   \
                 fate-checkasm-float_dsp                                 \
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v3 2/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64
  2024-04-17 18:01 [FFmpeg-devel] [PATCH v3 0/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64 Ramiro Polla
  2024-04-17 18:01 ` [FFmpeg-devel] [PATCH v3 1/2] checkasm: add test for fdct Ramiro Polla
@ 2024-04-17 18:01 ` Ramiro Polla
  2024-04-17 20:49 ` [FFmpeg-devel] [PATCH v3 0/2] " Martin Storsjö
  2 siblings, 0 replies; 10+ messages in thread
From: Ramiro Polla @ 2024-04-17 18:01 UTC (permalink / raw)
  To: ffmpeg-devel

The code is imported from libjpeg-turbo-3.0.1. The neon registers used
have been changed to avoid modifying v8-v15.

Reviewed-by: Martin Storsjö <martin@martin.st>
---
 libavcodec/aarch64/Makefile               |   2 +
 libavcodec/aarch64/fdct.h                 |  26 ++
 libavcodec/aarch64/fdctdsp_init_aarch64.c |  39 +++
 libavcodec/aarch64/fdctdsp_neon.S         | 368 ++++++++++++++++++++++
 libavcodec/avcodec.h                      |   1 +
 libavcodec/fdctdsp.c                      |   4 +-
 libavcodec/fdctdsp.h                      |   2 +
 libavcodec/options_table.h                |   1 +
 libavcodec/tests/aarch64/dct.c            |   2 +
 9 files changed, 444 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/fdct.h
 create mode 100644 libavcodec/aarch64/fdctdsp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/fdctdsp_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 95ad4dd202..a3256bb1cc 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -1,5 +1,6 @@
 # subsystems
 OBJS-$(CONFIG_AC3DSP)                   += aarch64/ac3dsp_init_aarch64.o
+OBJS-$(CONFIG_FDCTDSP)                  += aarch64/fdctdsp_init_aarch64.o
 OBJS-$(CONFIG_FMTCONVERT)               += aarch64/fmtconvert_init.o
 OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
@@ -37,6 +38,7 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP)           += aarch64/videodsp.o
 # subsystems
 NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/sbrdsp_neon.o
 NEON-OBJS-$(CONFIG_AC3DSP)              += aarch64/ac3dsp_neon.o
+NEON-OBJS-$(CONFIG_FDCTDSP)             += aarch64/fdctdsp_neon.o
 NEON-OBJS-$(CONFIG_FMTCONVERT)          += aarch64/fmtconvert_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264dsp_neon.o              \
diff --git a/libavcodec/aarch64/fdct.h b/libavcodec/aarch64/fdct.h
new file mode 100644
index 0000000000..0901b53a83
--- /dev/null
+++ b/libavcodec/aarch64/fdct.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_FDCT_H
+#define AVCODEC_AARCH64_FDCT_H
+
+#include <stdint.h>
+
+void ff_fdct_neon(int16_t *block);
+
+#endif /* AVCODEC_AARCH64_FDCT_H */
diff --git a/libavcodec/aarch64/fdctdsp_init_aarch64.c b/libavcodec/aarch64/fdctdsp_init_aarch64.c
new file mode 100644
index 0000000000..59d91bc8fc
--- /dev/null
+++ b/libavcodec/aarch64/fdctdsp_init_aarch64.c
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fdctdsp.h"
+#include "fdct.h"
+
+av_cold void ff_fdctdsp_init_aarch64(FDCTDSPContext *c, AVCodecContext *avctx,
+                                     unsigned high_bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+    /* Install ff_fdct_neon only when NEON is available, high_bit_depth is 0 and dct_algo is auto or neon. */
+    if (have_neon(cpu_flags)) {
+        if (!high_bit_depth) {              /* value is supplied by the caller, ff_fdctdsp_init() */
+            if (avctx->dct_algo == FF_DCT_AUTO ||
+                avctx->dct_algo == FF_DCT_NEON) {
+                c->fdct = ff_fdct_neon;     /* only c->fdct is written; all other fields of c are left untouched */
+            }
+        }
+    }
+}
diff --git a/libavcodec/aarch64/fdctdsp_neon.S b/libavcodec/aarch64/fdctdsp_neon.S
new file mode 100644
index 0000000000..53fa4debe5
--- /dev/null
+++ b/libavcodec/aarch64/fdctdsp_neon.S
@@ -0,0 +1,368 @@
+/*
+ * Armv8 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
+ * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014-2016, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+// #define EIGHT_BIT_SAMPLES
+
+/* Constants for jsimd_fdct_islow_neon() */
+
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+const jsimd_fdct_islow_neon_consts, align=4
+        .short F_0_298    /* v0.h[0]: XFIX_P_0_298 (lanes as loaded by ld1 {v0.8h, v1.8h}) */
+        .short -F_0_390   /* v0.h[1]: XFIX_N_0_390 */
+        .short F_0_541    /* v0.h[2]: XFIX_P_0_541 */
+        .short F_0_765    /* v0.h[3]: XFIX_P_0_765 */
+        .short - F_0_899  /* v0.h[4]: XFIX_N_0_899 */
+        .short F_1_175    /* v0.h[5]: XFIX_P_1_175 */
+        .short F_1_501    /* v0.h[6]: XFIX_P_1_501 */
+        .short - F_1_847  /* v0.h[7]: XFIX_N_1_847 */
+        .short - F_1_961  /* v1.h[0]: XFIX_N_1_961 */
+        .short F_2_053    /* v1.h[1]: XFIX_P_2_053 */
+        .short - F_2_562  /* v1.h[2]: XFIX_N_2_562 */
+        .short F_3_072    /* v1.h[3]: XFIX_P_3_072 */
+        .short 0          /* padding up to 16 halfwords: v1.h[4..7], not referenced by any XFIX_* macro */
+        .short 0
+        .short 0
+        .short 0
+endconst
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_islow_neon
+ *
+ * This file contains a slower but more accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform). The following code is based
+ * directly on the IJG''s original jfdctint.c; see the jfdctint.c for
+ * more details.
+ */
+
+#define CONST_BITS  13
+#ifdef EIGHT_BIT_SAMPLES
+#define PASS1_BITS  2
+#else
+#define PASS1_BITS  1   /* lose a little precision to avoid overflow */
+#endif
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+#define XFIX_P_0_298  v0.h[0]
+#define XFIX_N_0_390  v0.h[1]
+#define XFIX_P_0_541  v0.h[2]
+#define XFIX_P_0_765  v0.h[3]
+#define XFIX_N_0_899  v0.h[4]
+#define XFIX_P_1_175  v0.h[5]
+#define XFIX_P_1_501  v0.h[6]
+#define XFIX_N_1_847  v0.h[7]
+#define XFIX_N_1_961  v1.h[0]
+#define XFIX_P_2_053  v1.h[1]
+#define XFIX_N_2_562  v1.h[2]
+#define XFIX_P_3_072  v1.h[3]
+
+function ff_fdct_neon, export=1
+        /* In-place 8x8 forward DCT: x0 (DATA) points to 64 int16_t coefficients that are loaded, transformed and stored back. */
+        DATA            .req x0
+        TMP             .req x9
+
+        /* Load constants */
+        movrel          TMP, jsimd_fdct_islow_neon_consts
+        ld1             {v0.8h, v1.8h}, [TMP]
+
+        /* Load all DATA into Neon registers, one row of eight 16-bit
+         * coefficients per register (lane h[i] holds column i):
+         *
+         *   row 0: v16.8h
+         *   row 1: v17.8h
+         *   row 2: v18.8h
+         *   row 3: v19.8h
+         *   row 4: v20.8h
+         *   row 5: v21.8h
+         *   row 6: v22.8h
+         *   row 7: v23.8h
+         */
+
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+        sub             DATA, DATA, #64         /* undo the post-increment so the final stores start at the block again */
+
+        /* Transpose */
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v31, v2
+
+        /* 1-D FDCT, pass 1: outputs stay scaled up by PASS1_BITS */
+        add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+        sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+        add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+        sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+        add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+        sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+        add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+        sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+        /* Even part */
+        add             v4.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+        sub             v5.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+        add             v6.8h, v25.8h, v26.8h   /* tmp11 = tmp1 + tmp2; */
+        sub             v7.8h, v25.8h, v26.8h   /* tmp12 = tmp1 - tmp2; */
+
+        add             v16.8h, v4.8h, v6.8h   /* tmp10 + tmp11 */
+        sub             v20.8h, v4.8h, v6.8h   /* tmp10 - tmp11 */
+
+        add             v18.8h, v7.8h, v5.8h   /* tmp12 + tmp13 */
+
+        shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+        shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+
+        smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+        smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+        mov             v22.16b, v18.16b        /* copy z1 lo: v18 accumulates dataptr[2], v22 dataptr[6] */
+        mov             v25.16b, v24.16b        /* copy z1 hi: v24 accumulates dataptr[2], v25 dataptr[6] */
+
+        smlal           v18.4s, v5.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+        smlal2          v24.4s, v5.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+        smlal           v22.4s, v7.4h, XFIX_N_1_847   /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+        smlal2          v25.4s, v7.8h, XFIX_N_1_847   /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+        rshrn           v18.4h, v18.4s, #DESCALE_P1
+        rshrn           v22.4h, v22.4s, #DESCALE_P1
+        rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+        rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+
+        /* Odd part */
+        add             v2.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
+        add             v3.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
+        add             v6.8h, v28.8h, v30.8h        /* z3 = tmp4 + tmp6; */
+        add             v7.8h, v29.8h, v31.8h        /* z4 = tmp5 + tmp7; */
+        smull           v4.4s, v6.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
+        smull2          v5.4s, v6.8h, XFIX_P_1_175
+        smlal           v4.4s, v7.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+        smlal2          v5.4s, v7.8h, XFIX_P_1_175
+
+        smull2          v24.4s, v28.8h, XFIX_P_0_298
+        smull2          v25.4s, v29.8h, XFIX_P_2_053
+        smull2          v26.4s, v30.8h, XFIX_P_3_072
+        smull2          v27.4s, v31.8h, XFIX_P_1_501
+        smull           v23.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+        smull           v21.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+        smull           v19.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+        smull           v17.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+        smull2          v28.4s, v2.8h, XFIX_N_0_899
+        smull2          v29.4s, v3.8h, XFIX_N_2_562
+        smull2          v30.4s, v6.8h, XFIX_N_1_961
+        smull2          v31.4s, v7.8h, XFIX_N_0_390
+        smull           v2.4s, v2.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+        smull           v3.4s, v3.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+        smull           v6.4s, v6.4h, XFIX_N_1_961    /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+        smull           v7.4s, v7.4h, XFIX_N_0_390    /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
+
+        add             v6.4s, v6.4s, v4.4s    /* z3 += z5 */
+        add             v30.4s, v30.4s, v5.4s
+        add             v7.4s, v7.4s, v4.4s    /* z4 += z5 */
+        add             v31.4s, v31.4s, v5.4s
+
+        add             v23.4s, v23.4s, v2.4s   /* tmp4 += z1 */
+        add             v24.4s, v24.4s, v28.4s
+        add             v21.4s, v21.4s, v3.4s   /* tmp5 += z2 */
+        add             v25.4s, v25.4s, v29.4s
+        add             v19.4s, v19.4s, v6.4s   /* tmp6 += z3 */
+        add             v26.4s, v26.4s, v30.4s
+        add             v17.4s, v17.4s, v7.4s   /* tmp7 += z4 */
+        add             v27.4s, v27.4s, v31.4s
+
+        add             v23.4s, v23.4s, v6.4s   /* tmp4 += z3 */
+        add             v24.4s, v24.4s, v30.4s
+        add             v21.4s, v21.4s, v7.4s   /* tmp5 += z4 */
+        add             v25.4s, v25.4s, v31.4s
+        add             v19.4s, v19.4s, v3.4s   /* tmp6 += z2 */
+        add             v26.4s, v26.4s, v29.4s
+        add             v17.4s, v17.4s, v2.4s   /* tmp7 += z1 */
+        add             v27.4s, v27.4s, v28.4s
+
+        rshrn           v23.4h, v23.4s, #DESCALE_P1
+        rshrn           v21.4h, v21.4s, #DESCALE_P1
+        rshrn           v19.4h, v19.4s, #DESCALE_P1
+        rshrn           v17.4h, v17.4s, #DESCALE_P1
+        rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+        rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+        rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+        rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+        /* Transpose */
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v31, v2
+
+        /* 1-D FDCT, pass 2: the PASS1_BITS scaling from pass 1 is removed again */
+        add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+        sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+        add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+        sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+        add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+        sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+        add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+        sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+        /* Even part */
+        add             v4.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+        sub             v5.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+        add             v6.8h, v25.8h, v26.8h   /* tmp11 = tmp1 + tmp2; */
+        sub             v7.8h, v25.8h, v26.8h   /* tmp12 = tmp1 - tmp2; */
+
+        add             v16.8h, v4.8h, v6.8h   /* tmp10 + tmp11 */
+        sub             v20.8h, v4.8h, v6.8h   /* tmp10 - tmp11 */
+
+        add             v18.8h, v7.8h, v5.8h   /* tmp12 + tmp13 */
+
+        srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
+        srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
+
+        smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+        smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+        mov             v22.16b, v18.16b        /* copy z1 lo: v18 accumulates dataptr[2], v22 dataptr[6] */
+        mov             v25.16b, v24.16b        /* copy z1 hi: v24 accumulates dataptr[2], v25 dataptr[6] */
+
+        smlal           v18.4s, v5.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+        smlal2          v24.4s, v5.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+        smlal           v22.4s, v7.4h, XFIX_N_1_847   /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+        smlal2          v25.4s, v7.8h, XFIX_N_1_847   /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+        rshrn           v18.4h, v18.4s, #DESCALE_P2
+        rshrn           v22.4h, v22.4s, #DESCALE_P2
+        rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
+        rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
+
+        /* Odd part */
+        add             v2.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
+        add             v3.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
+        add             v6.8h, v28.8h, v30.8h   /* z3 = tmp4 + tmp6; */
+        add             v7.8h, v29.8h, v31.8h   /* z4 = tmp5 + tmp7; */
+
+        smull           v4.4s, v6.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
+        smull2          v5.4s, v6.8h, XFIX_P_1_175
+        smlal           v4.4s, v7.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+        smlal2          v5.4s, v7.8h, XFIX_P_1_175
+
+        smull2          v24.4s, v28.8h, XFIX_P_0_298
+        smull2          v25.4s, v29.8h, XFIX_P_2_053
+        smull2          v26.4s, v30.8h, XFIX_P_3_072
+        smull2          v27.4s, v31.8h, XFIX_P_1_501
+        smull           v23.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+        smull           v21.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+        smull           v19.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+        smull           v17.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+        smull2          v28.4s, v2.8h, XFIX_N_0_899
+        smull2          v29.4s, v3.8h, XFIX_N_2_562
+        smull2          v30.4s, v6.8h, XFIX_N_1_961
+        smull2          v31.4s, v7.8h, XFIX_N_0_390
+        smull           v2.4s, v2.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+        smull           v3.4s, v3.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+        smull           v6.4s, v6.4h, XFIX_N_1_961    /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+        smull           v7.4s, v7.4h, XFIX_N_0_390    /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
+
+        add             v6.4s, v6.4s, v4.4s    /* z3 += z5 */
+        add             v30.4s, v30.4s, v5.4s
+        add             v7.4s, v7.4s, v4.4s    /* z4 += z5 */
+        add             v31.4s, v31.4s, v5.4s
+
+        add             v23.4s, v23.4s, v2.4s   /* tmp4 += z1 */
+        add             v24.4s, v24.4s, v28.4s
+        add             v21.4s, v21.4s, v3.4s   /* tmp5 += z2 */
+        add             v25.4s, v25.4s, v29.4s
+        add             v19.4s, v19.4s, v6.4s   /* tmp6 += z3 */
+        add             v26.4s, v26.4s, v30.4s
+        add             v17.4s, v17.4s, v7.4s   /* tmp7 += z4 */
+        add             v27.4s, v27.4s, v31.4s
+
+        add             v23.4s, v23.4s, v6.4s   /* tmp4 += z3 */
+        add             v24.4s, v24.4s, v30.4s
+        add             v21.4s, v21.4s, v7.4s   /* tmp5 += z4 */
+        add             v25.4s, v25.4s, v31.4s
+        add             v19.4s, v19.4s, v3.4s   /* tmp6 += z2 */
+        add             v26.4s, v26.4s, v29.4s
+        add             v17.4s, v17.4s, v2.4s   /* tmp7 += z1 */
+        add             v27.4s, v27.4s, v28.4s
+
+        rshrn           v23.4h, v23.4s, #DESCALE_P2
+        rshrn           v21.4h, v21.4s, #DESCALE_P2
+        rshrn           v19.4h, v19.4s, #DESCALE_P2
+        rshrn           v17.4h, v17.4s, #DESCALE_P2
+        rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
+        rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
+        rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
+        rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */
+
+        /* Store results */
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+        ret
+
+        .unreq          DATA
+        .unreq          TMP
+endfunc
+
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 968009a192..2da63c87ea 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1538,6 +1538,7 @@ typedef struct AVCodecContext {
 #define FF_DCT_MMX     3
 #define FF_DCT_ALTIVEC 5
 #define FF_DCT_FAAN    6
+#define FF_DCT_NEON    7
 
     /**
      * IDCT algorithm, see FF_IDCT_* below.
diff --git a/libavcodec/fdctdsp.c b/libavcodec/fdctdsp.c
index f8ba17426c..d20558ce88 100644
--- a/libavcodec/fdctdsp.c
+++ b/libavcodec/fdctdsp.c
@@ -42,7 +42,9 @@ av_cold void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx)
         c->fdct248 = ff_fdct248_islow_8;
     }
 
-#if ARCH_PPC
+#if ARCH_AARCH64
+    ff_fdctdsp_init_aarch64(c, avctx, high_bit_depth);
+#elif ARCH_PPC
     ff_fdctdsp_init_ppc(c, avctx, high_bit_depth);
 #elif ARCH_X86
     ff_fdctdsp_init_x86(c, avctx, high_bit_depth);
diff --git a/libavcodec/fdctdsp.h b/libavcodec/fdctdsp.h
index 7378eab870..cad99ed7ca 100644
--- a/libavcodec/fdctdsp.h
+++ b/libavcodec/fdctdsp.h
@@ -32,6 +32,8 @@ typedef struct FDCTDSPContext {
 
 FF_VISIBILITY_PUSH_HIDDEN
 void ff_fdctdsp_init(FDCTDSPContext *c, struct AVCodecContext *avctx);
+void ff_fdctdsp_init_aarch64(FDCTDSPContext *c, struct AVCodecContext *avctx,
+                             unsigned high_bit_depth);
 void ff_fdctdsp_init_ppc(FDCTDSPContext *c, struct AVCodecContext *avctx,
                          unsigned high_bit_depth);
 void ff_fdctdsp_init_x86(FDCTDSPContext *c, struct AVCodecContext *avctx,
diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
index 7a70fa7b6c..33f1bce887 100644
--- a/libavcodec/options_table.h
+++ b/libavcodec/options_table.h
@@ -158,6 +158,7 @@ static const AVOption avcodec_options[] = {
 {"mmx", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_MMX }, INT_MIN, INT_MAX, V|E, .unit = "dct"},
 {"altivec", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_ALTIVEC }, INT_MIN, INT_MAX, V|E, .unit = "dct"},
 {"faan", "floating point AAN DCT", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FAAN }, INT_MIN, INT_MAX, V|E, .unit = "dct"},
+{"neon", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_NEON }, INT_MIN, INT_MAX, V|E, .unit = "dct"},
 {"lumi_mask", "compresses bright areas stronger than medium ones", OFFSET(lumi_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E},
 {"tcplx_mask", "temporal complexity masking", OFFSET(temporal_cplx_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E},
 {"scplx_mask", "spatial complexity masking", OFFSET(spatial_cplx_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E},
diff --git a/libavcodec/tests/aarch64/dct.c b/libavcodec/tests/aarch64/dct.c
index 9e477328d5..e98a887cd5 100644
--- a/libavcodec/tests/aarch64/dct.c
+++ b/libavcodec/tests/aarch64/dct.c
@@ -19,9 +19,11 @@
 #include "config.h"
 
 #include "libavutil/cpu.h"
+#include "libavcodec/aarch64/fdct.h"
 #include "libavcodec/aarch64/idct.h"
 
 static const struct algo fdct_tab_arch[] = {
+    { "neon", ff_fdct_neon, FF_IDCT_PERM_NONE, AV_CPU_FLAG_NEON },
     { 0 }
 };
 
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3 0/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64
  2024-04-17 18:01 [FFmpeg-devel] [PATCH v3 0/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64 Ramiro Polla
  2024-04-17 18:01 ` [FFmpeg-devel] [PATCH v3 1/2] checkasm: add test for fdct Ramiro Polla
  2024-04-17 18:01 ` [FFmpeg-devel] [PATCH v3 2/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64 Ramiro Polla
@ 2024-04-17 20:49 ` Martin Storsjö
  2024-05-13 12:55   ` Ramiro Polla
  2 siblings, 1 reply; 10+ messages in thread
From: Martin Storsjö @ 2024-04-17 20:49 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, 17 Apr 2024, Ramiro Polla wrote:

> This patch set adds fdct to checkasm and neon-optimized fdct for aarch64.
>
> Ramiro Polla (2):
>  checkasm: add test for fdct
>  lavc/aarch64/fdct: add neon-optimized fdct for aarch64
>
> libavcodec/aarch64/Makefile               |   2 +
> libavcodec/aarch64/fdct.h                 |  26 ++
> libavcodec/aarch64/fdctdsp_init_aarch64.c |  39 +++
> libavcodec/aarch64/fdctdsp_neon.S         | 368 ++++++++++++++++++++++
> libavcodec/avcodec.h                      |   1 +
> libavcodec/fdctdsp.c                      |   4 +-
> libavcodec/fdctdsp.h                      |   2 +
> libavcodec/options_table.h                |   1 +
> libavcodec/tests/aarch64/dct.c            |   2 +
> tests/checkasm/Makefile                   |   1 +
> tests/checkasm/checkasm.c                 |   3 +
> tests/checkasm/checkasm.h                 |   1 +
> tests/checkasm/fdctdsp.c                  |  68 ++++
> tests/fate/checkasm.mak                   |   1 +
> 14 files changed, 518 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/fdct.h
> create mode 100644 libavcodec/aarch64/fdctdsp_init_aarch64.c
> create mode 100644 libavcodec/aarch64/fdctdsp_neon.S
> create mode 100644 tests/checkasm/fdctdsp.c

LGTM, thanks!

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3 1/2] checkasm: add test for fdct
  2024-04-17 18:01 ` [FFmpeg-devel] [PATCH v3 1/2] checkasm: add test for fdct Ramiro Polla
@ 2024-05-02 18:04   ` Rémi Denis-Courmont
  2024-05-06 17:18     ` Ramiro Polla
  0 siblings, 1 reply; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-02 18:04 UTC (permalink / raw)
  To: ffmpeg-devel

Le keskiviikkona 17. huhtikuuta 2024, 21.01.37 EEST Ramiro Polla a écrit :
> Reviewed-by: Martin Storsjö <martin@martin.st>
> ---
>  tests/checkasm/Makefile   |  1 +
>  tests/checkasm/checkasm.c |  3 ++
>  tests/checkasm/checkasm.h |  1 +
>  tests/checkasm/fdctdsp.c  | 68 +++++++++++++++++++++++++++++++++++++++
>  tests/fate/checkasm.mak   |  1 +
>  5 files changed, 74 insertions(+)
>  create mode 100644 tests/checkasm/fdctdsp.c
> 
> diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> index 2673e1d098..70a6120c70 100644
> --- a/tests/checkasm/Makefile
> +++ b/tests/checkasm/Makefile
> @@ -4,6 +4,7 @@ AVCODECOBJS-$(CONFIG_AC3DSP)            += ac3dsp.o
>  AVCODECOBJS-$(CONFIG_AUDIODSP)          += audiodsp.o
>  AVCODECOBJS-$(CONFIG_BLOCKDSP)          += blockdsp.o
>  AVCODECOBJS-$(CONFIG_BSWAPDSP)          += bswapdsp.o
> +AVCODECOBJS-$(CONFIG_FDCTDSP)           += fdctdsp.o
>  AVCODECOBJS-$(CONFIG_FMTCONVERT)        += fmtconvert.o
>  AVCODECOBJS-$(CONFIG_G722DSP)           += g722dsp.o
>  AVCODECOBJS-$(CONFIG_H264CHROMA)        += h264chroma.o
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> index 8be6cb0f55..92c3a30ad3 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -106,6 +106,9 @@ static const struct {
>      #if CONFIG_EXR_DECODER
>          { "exrdsp", checkasm_check_exrdsp },
>      #endif
> +    #if CONFIG_FDCTDSP
> +        { "fdctdsp", checkasm_check_fdctdsp },
> +    #endif
>      #if CONFIG_FLAC_DECODER
>          { "flacdsp", checkasm_check_flacdsp },
>      #endif
> diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
> index f90920dee7..d3e8f9a37a 100644
> --- a/tests/checkasm/checkasm.h
> +++ b/tests/checkasm/checkasm.h
> @@ -85,6 +85,7 @@ void checkasm_check_blockdsp(void);
>  void checkasm_check_bswapdsp(void);
>  void checkasm_check_colorspace(void);
>  void checkasm_check_exrdsp(void);
> +void checkasm_check_fdctdsp(void);
>  void checkasm_check_fixed_dsp(void);
>  void checkasm_check_flacdsp(void);
>  void checkasm_check_float_dsp(void);
> diff --git a/tests/checkasm/fdctdsp.c b/tests/checkasm/fdctdsp.c
> new file mode 100644
> index 0000000000..68a9b5e435
> --- /dev/null
> +++ b/tests/checkasm/fdctdsp.c
> @@ -0,0 +1,68 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include <string.h>
> +
> +#include "checkasm.h"
> +
> +#include "libavcodec/avcodec.h"
> +#include "libavcodec/fdctdsp.h"
> +
> +#include "libavutil/common.h"
> +#include "libavutil/internal.h"
> +#include "libavutil/mem_internal.h"
> +
> +static int int16_cmp_off_by_n(const int16_t *ref, const int16_t *test,
> +                              size_t n, int accuracy)
> +{
> +    for (size_t i = 0; i < n; i++) {
> +        if (abs(ref[i] - test[i]) > accuracy)
> +            return 1;
> +    }
> +    return 0;
> +}
> +
> +static void check_fdct(void)
> +{
> +    LOCAL_ALIGNED_16(int16_t, block0, [64]);
> +    LOCAL_ALIGNED_16(int16_t, block1, [64]);
> +
> +    AVCodecContext avctx = { 0 };

AFAICT, that is not a legal context for ff_fdctdsp_init(), which expects 
bits_per_raw_sample to be one of 8, 9 or 10. It would also be good manners to 
initialise dct_algo.

> +    FDCTDSPContext h;
> +
> +    ff_fdctdsp_init(&h, &avctx);
> +
> +    if (check_func(h.fdct, "fdct")) {
> +        declare_func(void, int16_t *);
> +        for (int i = 0; i < 64; i++) {
> +            uint8_t r = rnd();
> +            block0[i] = r;
> +            block1[i] = r;
> +        }
> +        call_ref(block0);
> +        call_new(block1);
> +        if (int16_cmp_off_by_n(block0, block1, 64, 2))
> +            fail();
> +        bench_new(block1);
> +    }
> +}
> +
> +void checkasm_check_fdctdsp(void)
> +{
> +    check_fdct();
> +    report("fdctdsp");
> +}
> diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
> index 3b5b867a97..10a42f2f9d 100644
> --- a/tests/fate/checkasm.mak
> +++ b/tests/fate/checkasm.mak
> @@ -8,6 +8,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                                 \
>                  fate-checkasm-blockdsp                                  \
>                  fate-checkasm-bswapdsp                                  \
>                  fate-checkasm-exrdsp                                    \
> +                fate-checkasm-fdctdsp                                   \
>                  fate-checkasm-fixed_dsp                                 \
>                  fate-checkasm-flacdsp                                   \
>                  fate-checkasm-float_dsp                                 \


-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3 1/2] checkasm: add test for fdct
  2024-05-02 18:04   ` Rémi Denis-Courmont
@ 2024-05-06 17:18     ` Ramiro Polla
  2024-05-06 17:49       ` Rémi Denis-Courmont
  0 siblings, 1 reply; 10+ messages in thread
From: Ramiro Polla @ 2024-05-06 17:18 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Thu, May 2, 2024 at 8:05 PM Rémi Denis-Courmont <remi@remlab.net> wrote:
> Le keskiviikkona 17. huhtikuuta 2024, 21.01.37 EEST Ramiro Polla a écrit :
[...]
> > +static void check_fdct(void)
> > +{
> > +    LOCAL_ALIGNED_16(int16_t, block0, [64]);
> > +    LOCAL_ALIGNED_16(int16_t, block1, [64]);
> > +
> > +    AVCodecContext avctx = { 0 };
>
> AFAICT, that is not a legal context for ff_fdctdsp_init(), which expects
> bits_per_raw_sample to be one of 8, 9 or 10. It would also be good manners to
> initialise dct_algo.

Thanks for spotting it. New patch coming up in a while.

I'll send a similar patch to fix checkasm/idctdsp after this is merged.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3 1/2] checkasm: add test for fdct
  2024-05-06 17:18     ` Ramiro Polla
@ 2024-05-06 17:49       ` Rémi Denis-Courmont
  2024-05-13 16:49         ` James Almer
  0 siblings, 1 reply; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-06 17:49 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le maanantaina 6. toukokuuta 2024, 20.18.11 EEST Ramiro Polla a écrit :
> I'll send a similar patch to fix checkasm/idctdsp after this is merged.

The idctdsp test does not actually test the iDCT, but only the trivial-ish 
add/put helpers, so it does not care about the context. You're welcome to fix 
it anyway of course.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3 0/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64
  2024-04-17 20:49 ` [FFmpeg-devel] [PATCH v3 0/2] " Martin Storsjö
@ 2024-05-13 12:55   ` Ramiro Polla
  0 siblings, 0 replies; 10+ messages in thread
From: Ramiro Polla @ 2024-05-13 12:55 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, Apr 17, 2024 at 10:49 PM Martin Storsjö <martin@martin.st> wrote:
> On Wed, 17 Apr 2024, Ramiro Polla wrote:
> > This patch set adds fdct to checkasm and neon-optimized fdct for aarch64.
> >
> > Ramiro Polla (2):
> >  checkasm: add test for fdct
> >  lavc/aarch64/fdct: add neon-optimized fdct for aarch64
> >
> > libavcodec/aarch64/Makefile               |   2 +
> > libavcodec/aarch64/fdct.h                 |  26 ++
> > libavcodec/aarch64/fdctdsp_init_aarch64.c |  39 +++
> > libavcodec/aarch64/fdctdsp_neon.S         | 368 ++++++++++++++++++++++
> > libavcodec/avcodec.h                      |   1 +
> > libavcodec/fdctdsp.c                      |   4 +-
> > libavcodec/fdctdsp.h                      |   2 +
> > libavcodec/options_table.h                |   1 +
> > libavcodec/tests/aarch64/dct.c            |   2 +
> > tests/checkasm/Makefile                   |   1 +
> > tests/checkasm/checkasm.c                 |   3 +
> > tests/checkasm/checkasm.h                 |   1 +
> > tests/checkasm/fdctdsp.c                  |  68 ++++
> > tests/fate/checkasm.mak                   |   1 +
> > 14 files changed, 518 insertions(+), 1 deletion(-)
> > create mode 100644 libavcodec/aarch64/fdct.h
> > create mode 100644 libavcodec/aarch64/fdctdsp_init_aarch64.c
> > create mode 100644 libavcodec/aarch64/fdctdsp_neon.S
> > create mode 100644 tests/checkasm/fdctdsp.c
>
> LGTM, thanks!

Pushed.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3 1/2] checkasm: add test for fdct
  2024-05-06 17:49       ` Rémi Denis-Courmont
@ 2024-05-13 16:49         ` James Almer
  2024-05-13 19:14           ` Ramiro Polla
  0 siblings, 1 reply; 10+ messages in thread
From: James Almer @ 2024-05-13 16:49 UTC (permalink / raw)
  To: ffmpeg-devel

On 5/6/2024 2:49 PM, Rémi Denis-Courmont wrote:
> Le maanantaina 6. toukokuuta 2024, 20.18.11 EEST Ramiro Polla a écrit :
>> I'll send a similar patch to fix checkasm/idctdsp after this is merged.
> 
> The idctdsp test does not actually test the iDCT, but only the trivial-ish
> add/put helpers, so it does not care about the context. You're welcome to fix
> it anyway of course.

I personally find it ugly how we're storing a whole AVCodecContext on 
stack in these tests just to pass two ints to an init function.
Maybe we can make said values be input parameters for these instead of a 
pointer to avctx.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v3 1/2] checkasm: add test for fdct
  2024-05-13 16:49         ` James Almer
@ 2024-05-13 19:14           ` Ramiro Polla
  0 siblings, 0 replies; 10+ messages in thread
From: Ramiro Polla @ 2024-05-13 19:14 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Mon, May 13, 2024 at 6:49 PM James Almer <jamrial@gmail.com> wrote:
> On 5/6/2024 2:49 PM, Rémi Denis-Courmont wrote:
> > Le maanantaina 6. toukokuuta 2024, 20.18.11 EEST Ramiro Polla a écrit :
> >> I'll send a similar patch to fix checkasm/idctdsp after this is merged.
> >
> > The idctdsp test does not actually test the iDCT, but only the trivial-ish
> > add/put helpers, so it does not care about the context. You're welcome to fix
> > it anyway of course.
>
> I personally find it ugly how we're storing a whole AVCodecContext on
> stack in these tests just to pass two ints to an init function.
> Maybe we can make said values be input parameters for these instead of a
> pointer to avctx.

It could make sense for fdct, but for idct we need a few more
parameters (bits_per_raw_sample, codec_id, flags, idct_algo, lowres).
That would make the function calls much longer, and in that case I'd
prefer just keeping AVCodecContext.
Or having an input parameter structure for each *dsp context, but that
seems a bit overkill.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-05-13 19:14 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-17 18:01 [FFmpeg-devel] [PATCH v3 0/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64 Ramiro Polla
2024-04-17 18:01 ` [FFmpeg-devel] [PATCH v3 1/2] checkasm: add test for fdct Ramiro Polla
2024-05-02 18:04   ` Rémi Denis-Courmont
2024-05-06 17:18     ` Ramiro Polla
2024-05-06 17:49       ` Rémi Denis-Courmont
2024-05-13 16:49         ` James Almer
2024-05-13 19:14           ` Ramiro Polla
2024-04-17 18:01 ` [FFmpeg-devel] [PATCH v3 2/2] lavc/aarch64/fdct: add neon-optimized fdct for aarch64 Ramiro Polla
2024-04-17 20:49 ` [FFmpeg-devel] [PATCH v3 0/2] " Martin Storsjö
2024-05-13 12:55   ` Ramiro Polla

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git