[FFmpeg-devel] [PATCH v2 1/2] avutil/loongarch: add LSX optimization for aac audio decode

From: pengxu <pengxu@loongson.cn>
To: ffmpeg-devel@ffmpeg.org
Subject: [FFmpeg-devel] [PATCH v2 1/2] avutil/loongarch: add LSX optimization for aac audio decode
Date: Thu, 18 Apr 2024 15:36:08 +0800
Message-ID: <20240418073609.19365-2-pengxu@loongson.cn> (raw)
In-Reply-To: <20240418073609.19365-1-pengxu@loongson.cn>

Add functions:
    vector_fmul_window_lsx
    butterflies_float_lsx
    vector_fmul_scalar_lsx

./ffmpeg -i ../../1.aac -f null -
before:482x
after:523x
---
 libavutil/float_dsp.c                         |   2 +
 libavutil/float_dsp.h                         |   1 +
 libavutil/loongarch/Makefile                  |   5 +-
 libavutil/loongarch/float_dsp.S               | 287 ++++++++++++++++++
 libavutil/loongarch/float_dsp.h               |  32 ++
 .../loongarch/float_dsp_init_loongarch.c      |  35 +++
 6 files changed, 361 insertions(+), 1 deletion(-)
 create mode 100644 libavutil/loongarch/float_dsp.S
 create mode 100644 libavutil/loongarch/float_dsp.h
 create mode 100644 libavutil/loongarch/float_dsp_init_loongarch.c

diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c
index e9fb023466..7128ff3f96 100644
--- a/libavutil/float_dsp.c
+++ b/libavutil/float_dsp.c
@@ -162,6 +162,8 @@ av_cold AVFloatDSPContext *avpriv_float_dsp_alloc(int bit_exact)
     ff_float_dsp_init_x86(fdsp);
 #elif ARCH_MIPS
     ff_float_dsp_init_mips(fdsp);
+#elif ARCH_LOONGARCH64
+    ff_float_dsp_init_loongarch(fdsp);
 #endif
     return fdsp;
 }
diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h
index 342a8715c5..679a930eab 100644
--- a/libavutil/float_dsp.h
+++ b/libavutil/float_dsp.h
@@ -206,6 +206,7 @@ void ff_float_dsp_init_ppc(AVFloatDSPContext *fdsp, int strict);
 void ff_float_dsp_init_riscv(AVFloatDSPContext *fdsp);
 void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp);
 void ff_float_dsp_init_mips(AVFloatDSPContext *fdsp);
+void ff_float_dsp_init_loongarch(AVFloatDSPContext *fdsp);
 
 /**
  * Allocate a float DSP context.
diff --git a/libavutil/loongarch/Makefile b/libavutil/loongarch/Makefile
index 2addd9351c..ae710f0515 100644
--- a/libavutil/loongarch/Makefile
+++ b/libavutil/loongarch/Makefile
@@ -1 +1,4 @@
-OBJS += loongarch/cpu.o
+OBJS += loongarch/cpu.o \
+		loongarch/float_dsp_init_loongarch.o
+
+LSX-OBJS += loongarch/float_dsp.o
diff --git a/libavutil/loongarch/float_dsp.S b/libavutil/loongarch/float_dsp.S
new file mode 100644
index 0000000000..5073c8424f
--- /dev/null
+++ b/libavutil/loongarch/float_dsp.S
@@ -0,0 +1,287 @@
+/*
+ * LoongArch LASX/LSX optimized dsp
+ *
+ * Copyright (c) 2024 Loongson Technology Corporation Limited
+ * Contributed by PengXu <pengxu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+
+/* void vector_fmul_window(float *dst, const float *src0,
+                           const float *src1, const float *win, int len)
+ * With dst, src0 and win advanced by len, for i = -len..-1 and j = len-1..0:
+ *     dst[i] = src0[i] * win[j] - src1[j] * win[i]
+ *     dst[j] = src0[i] * win[i] + src1[j] * win[j]
+ * so dst and win are accessed over 2 * len floats, src0 and src1 over len. */
+function vector_fmul_window_lsx
+    move        t4,     a0            //dst
+    move        t5,     a1            //src0
+    move        t6,     a2            //src1
+    move        t7,     a3            //win
+    move        t8,     a4
+    slli.d      t8,     t8,     2     //len in bytes
+
+    add.d       t4,     t4,     t8    //dst  + len
+    add.d       t7,     t7,     t8    //win  + len
+    add.d       t5,     t5,     t8    //src0 + len
+
+    add.d       a6,     $r0,    t8    //-i as a byte offset
+    addi.d      a7,     t8,     -4    //j as a byte offset
+
+    move        a5,     $r0
+    srai.d      t0,     a4,     2     //number of 4-float passes
+    beq         a5,     t0,     .VFW02
+
+// Main loop: four floats at the front (i) and four at the back (j) per pass.
+.VFW01:
+    sub.d       t1,     t5,     a6
+    addi.d      t2,     a7,     -12   //byte offset of element j - 3
+    vld         vr1,    t1,     0x00  //s0
+    vldx        vr2,    a2,     t2    //s1
+
+    sub.d       t1,     t7,     a6
+    vld         vr3,    t1,     0x00  //wi
+    vldx        vr4,    t7,     t2    //wj
+
+    vpermi.w    vr2,    vr2,    0x1b  //reverse lane order: lane 0 = s1[j]
+    vpermi.w    vr4,    vr4,    0x1b  //reverse lane order: lane 0 = wj[j]
+
+    vfmul.s     vr5,    vr2,    vr3
+    vfmsub.s    vr5,    vr1,    vr4,    vr5  //dsti
+
+    vfmul.s     vr6,    vr2,    vr4
+    vfmadd.s    vr6,    vr1,    vr3,    vr6  //dstj
+
+    vpermi.w    vr6,    vr6,    0x1b  //back to memory order
+
+    sub.d       t1,     t4,     a6
+    vst         vr5,    t1,     0x00
+    vstx        vr6,    t4,     t2
+
+    addi.d      a6,     a6,     -16
+    addi.d      a7,     a7,     -16
+
+    addi.d      a5,     a5,     1
+    blt         a5,     t0,     .VFW01
+
+// Tail: two more sample pairs when bit 1 of len is set.
+.VFW02:
+    andi        t0,     a4,     2
+    beq         $r0,    t0,     .VFW03
+
+    sub.d       t0,     t5,     a6    //&src0[i]
+    addi.d      t1,     a7,     -4
+    add.d       t1,     t6,     t1    //&src1[j - 1], src1 is not advanced by len
+
+    sub.d       t2,     t7,     a6    //&win[i]
+    addi.d      t3,     a7,     -4
+    add.d       t3,     t7,     t3    //&win[j - 1]
+
+    fld.s       f0,     t0,     0x00  //s0
+    fld.s       f1,     t0,     0x04
+
+    fld.s       f2,     t1,     0x04  //s1
+    fld.s       f3,     t1,     0x00
+
+    fld.s       f4,     t2,     0x00  //wi
+    fld.s       f5,     t2,     0x04
+
+    fld.s       f6,     t3,     0x04  //wj
+    fld.s       f7,     t3,     0x00
+
+    fmul.s      f8,     f2,     f4
+    fmsub.s     f8,     f0,     f6,    f8  //dsti
+    fmul.s      f9,     f3,     f5
+    fmsub.s     f9,     f1,     f7,    f9
+
+    fmul.s      f10,    f2,     f6
+    fmadd.s     f10,    f0,     f4,    f10  //dstj
+    fmul.s      f11,    f3,     f7
+    fmadd.s     f11,    f1,     f5,    f11
+
+    sub.d       t2,     t4,     a6
+    add.d       t3,     t4,     a7
+    addi.d      t3,     t3,     -4
+
+    fst.s       f8,     t2,     0x00
+    fst.s       f9,     t2,     0x04
+    fst.s       f10,    t3,     0x04
+    fst.s       f11,    t3,     0x00
+
+    addi.d      a6,     a6,     -8    //two floats consumed at the front
+    addi.d      a7,     a7,     -8    //two floats consumed at the back
+
+// Tail: one last sample pair when len is odd.
+.VFW03:
+    andi        t0,     a4,     1
+    beq         $r0,    t0,     .VFW04
+
+    sub.d       t0,     $r0,    a6    //i as a (negative) byte offset
+
+    fldx.s      f0,     t5,     t0  //s0
+    fldx.s      f2,     t6,     a7   //s1
+    fldx.s      f4,     t7,     t0  //wi
+    fldx.s      f6,     t7,     a7   //wj
+
+    fmul.s      f8,     f2,     f4
+    fmsub.s     f8,     f0,     f6,    f8  //dsti
+
+    fmul.s      f10,    f2,     f6
+    fmadd.s     f10,    f0,     f4,    f10  //dstj
+
+    sub.d       t0,     t4,     a6
+
+    fst.s       f8,     t0,     0x00
+    fstx.s      f10,    t4,     a7
+
+// No callee-saved register is written and the stack is untouched, so there
+// is nothing to restore here.
+.VFW04:
+endfunc
+
+
+/* void butterflies_float(float *restrict v1, float *restrict v2, int len)
+ * For i in [0, len): v1[i] = v1[i] + v2[i] and v2[i] = old v1[i] - v2[i].  */
+function butterflies_float_lsx
+    move        a6,     $r0           //pass counter
+    move        a7,     $r0           //byte offset into v1 and v2
+
+    move        t4,     a0            //v1
+    move        t5,     a1            //v2
+    move        t6,     a2            //len
+
+    srai.d      t0,     t6,     2     //number of 4-float passes
+    beq         a6,     t0,     .BFL02
+
+.BFL01:                               //four floats per pass
+    vldx        vr0,    t4,     a7
+    vldx        vr1,    t5,     a7
+
+    vfsub.s     vr3,    vr0,    vr1   //v1 - v2
+    vfadd.s     vr4,    vr0,    vr1   //v1 + v2
+
+    vstx        vr4,    t4,     a7    //sum goes back to v1
+    vstx        vr3,    t5,     a7    //difference goes back to v2
+
+    addi.d      a7,     a7,     16
+    addi.d      a6,     a6,     1
+    blt         a6,     t0,     .BFL01
+
+.BFL02:                               //two more floats when bit 1 of len is set
+    andi        t0,     t6,     2
+    beq         $r0,    t0,     .BFL03
+
+    add.d       t1,     t4,     a7
+    add.d       t2,     t5,     a7
+
+    fld.s       f0,     t1,     0x00
+    fld.s       f1,     t1,     0x04
+    fld.s       f2,     t2,     0x00
+    fld.s       f3,     t2,     0x04
+
+    fsub.s      f4,     f0,     f2
+    fsub.s      f5,     f1,     f3
+    fadd.s      f6,     f0,     f2
+    fadd.s      f7,     f1,     f3
+
+    fst.s       f6,     t1,     0x00
+    fst.s       f7,     t1,     0x04
+    fst.s       f4,     t2,     0x00
+    fst.s       f5,     t2,     0x04
+
+    addi.d      a7,     a7,     8
+
+.BFL03:                               //one last float when len is odd
+    andi        t0,     t6,     1
+    beq         $r0,    t0,     .BFL04
+
+    fldx.s      f0,     t4,     a7
+    fldx.s      f2,     t5,     a7
+
+    fsub.s      f4,     f0,     f2
+    fadd.s      f6,     f0,     f2
+
+    fstx.s      f6,     t4,     a7
+    fstx.s      f4,     t5,     a7
+
+    addi.d      a7,     a7,     4     //NOTE(review): a7 is not read after this
+
+.BFL04:
+endfunc
+
+
+/* void vector_fmul_scalar_lsx(float *dst, const float *src, float mul, int len)
+ * dst[i] = src[i] * mul for i in [0, len); mul is read from f0, len from a2. */
+function vector_fmul_scalar_lsx
+    move        a6,     $r0           //pass counter
+    move        a7,     $r0           //byte offset into dst and src
+
+    move        t4,     a0            //dst
+    move        t5,     a1            //src
+    move        t6,     a2            //len
+
+    vpermi.w    vr0,    vr0,    0x00  //copy lane 0 (mul) into all four lanes
+
+    srai.d      t0,     t6,     2     //number of 4-float passes
+    beq         a6,     t0,     .BFS02
+
+.BFS01:                               //four floats per pass
+    vldx        vr1,    t5,     a7
+
+    vfmul.s     vr2,    vr1,    vr0
+
+    vstx        vr2,    t4,     a7
+
+    addi.d      a7,     a7,     16
+    addi.d      a6,     a6,     1
+    blt         a6,     t0,     .BFS01
+
+.BFS02:                               //two more floats when bit 1 of len is set
+    andi        t0,     t6,     2
+    beq         $r0,    t0,     .BFS03
+
+    add.d       t1,     t5,     a7
+    add.d       t2,     t4,     a7
+
+    fld.s       f1,     t1,     0x00
+    fld.s       f2,     t1,     0x04
+
+    fmul.s      f3,     f1,     f0
+    fmul.s      f4,     f2,     f0
+
+    fst.s       f3,     t2,     0x00
+    fst.s       f4,     t2,     0x04
+
+    addi.d      a7,     a7,     8
+
+.BFS03:                               //one last float when len is odd
+    andi        t0,     t6,     1
+    beq         $r0,    t0,     .BFS04
+
+    fldx.s      f1,     t5,     a7
+
+    fmul.s      f3,     f1,     f0
+
+    fstx.s      f3,     t4,     a7
+
+    addi.d      a7,     a7,     4     //NOTE(review): a7 is not read after this
+
+.BFS04:
+endfunc
\ No newline at end of file
diff --git a/libavutil/loongarch/float_dsp.h b/libavutil/loongarch/float_dsp.h
new file mode 100644
index 0000000000..644c1f3713
--- /dev/null
+++ b/libavutil/loongarch/float_dsp.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 Loongson Technology Corporation Limited
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_LOONGARCH_FLOAT_DSP_H
+#define AVUTIL_LOONGARCH_FLOAT_DSP_H
+
+#include "libavutil/float_dsp.h"
+/* Windowed mix of src0 and src1; dst and win are accessed over 2 * len floats. */
+void vector_fmul_window_lsx(float *dst, const float *src0,
+                            const float *src1, const float *win, int len);
+/* In place: v1[i] = v1[i] + v2[i] and v2[i] = old v1[i] - v2[i], for i < len. */
+void butterflies_float_lsx(float *restrict v1, float *restrict v2, int len);
+/* dst[i] = src[i] * mul, for i < len. */
+void vector_fmul_scalar_lsx(float *dst, const float *src, float mul, int len);
+
+#endif /* AVUTIL_LOONGARCH_FLOAT_DSP_H */
\ No newline at end of file
diff --git a/libavutil/loongarch/float_dsp_init_loongarch.c b/libavutil/loongarch/float_dsp_init_loongarch.c
new file mode 100644
index 0000000000..592ba78058
--- /dev/null
+++ b/libavutil/loongarch/float_dsp_init_loongarch.c
@@ -0,0 +1,35 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "float_dsp.h"
+#include "libavutil/loongarch/cpu.h"
+
+av_cold void ff_float_dsp_init_loongarch(AVFloatDSPContext *fdsp)
+{
+    const int flags = av_get_cpu_flags();
+
+    if (!have_lsx(flags))
+        return; /* no LSX: leave the function pointers in fdsp untouched */
+    fdsp->butterflies_float  = butterflies_float_lsx;
+    fdsp->vector_fmul_scalar = vector_fmul_scalar_lsx;
+    fdsp->vector_fmul_window = vector_fmul_window_lsx;
+}
-- 
2.20.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".