* [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
@ 2023-09-26 9:24 flow gg
2023-09-26 18:34 ` Rémi Denis-Courmont
` (2 more replies)
0 siblings, 3 replies; 18+ messages in thread
From: flow gg @ 2023-09-26 9:24 UTC (permalink / raw)
To: ffmpeg-devel
[-- Attachment #1: Type: text/plain, Size: 52 bytes --]
benchmark:
fcmul_add_c: 19.7
fcmul_add_rvv_f32: 6.7
[-- Attachment #2: 0001-af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 4830 bytes --]
From 6bef2523728a472bb803ce085a1aafdfd624e212 Mon Sep 17 00:00:00 2001
From: h <hlefthleft@gmail.com>
Date: Tue, 26 Sep 2023 15:03:12 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add
fcmul_add_c: 19.7
fcmul_add_rvv_f32: 6.7
---
libavfilter/af_afirdsp.h | 3 ++
libavfilter/riscv/Makefile | 2 +
libavfilter/riscv/af_afir_init.c | 39 +++++++++++++++++++
libavfilter/riscv/af_afir_rvv.S | 65 ++++++++++++++++++++++++++++++++
4 files changed, 109 insertions(+)
create mode 100644 libavfilter/riscv/Makefile
create mode 100644 libavfilter/riscv/af_afir_init.c
create mode 100644 libavfilter/riscv/af_afir_rvv.S
diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..d2d1e909c1 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
} AudioFIRDSPContext;
void ff_afir_init_x86(AudioFIRDSPContext *s);
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
{
@@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
#if ARCH_X86
ff_afir_init_x86(dsp);
+#elif ARCH_RISCV
+ ff_afir_init_riscv(dsp);
#endif
}
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..ffa176abd2
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+ ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+ int flags = av_get_cpu_flags();
+
+ if (flags & AV_CPU_FLAG_RVV_F32)
+ s->fcmul_add = ff_fcmul_add_rvv;
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..06c3979575
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+// void ff_fcmul_add_rvv(float *sum, const float *t, const float *c, ptrdiff_t len)
+func ff_fcmul_add_rvv, zve32f
+1:
+ li t1, 4
+ vsetvli t0, t1, e32, m1, ta, ma
+
+ li t2, 8
+
+ vlsseg2e32.v v0, (a1), t2
+ vlsseg2e32.v v2, (a2), t2
+ vlsseg2e32.v v4, (a0), t2
+
+ vfmul.vv v6, v0, v2
+ vfmul.vv v7, v1, v3
+ vfmul.vv v8, v0, v3
+ vfmul.vv v9, v1, v2
+
+ vfadd.vv v4, v4, v6
+ vfsub.vv v4, v4, v7
+ vfadd.vv v5, v5, v8
+ vfadd.vv v5, v5, v9
+
+ vssseg2e32.v v4, (a0), t2
+
+ mul t3, t2, t1
+ add a0, a0, t3
+ add a1, a1, t3
+ add a2, a2, t3
+
+ sub a3, a3, t0
+ bgtz a3, 1b
+
+ flw fa0, 0(a1)
+ flw fa1, 0(a2)
+ flw fa2, 0(a0)
+
+ fmul.s fa0, fa0, fa1
+ fadd.s fa2, fa2, fa0
+
+ fsw fa2, 0(a0)
+
+ ret
+endfunc
--
2.42.0
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-09-26 9:24 [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add flow gg
@ 2023-09-26 18:34 ` Rémi Denis-Courmont
2023-09-26 18:40 ` Paul B Mahol
2023-09-26 18:50 ` Rémi Denis-Courmont
2023-09-27 16:41 ` Rémi Denis-Courmont
2 siblings, 1 reply; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-26 18:34 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: flow gg
Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> benchmark:
> fcmul_add_c: 19.7
> fcmul_add_rvv_f32: 6.7
Nit: please pad mnemonics to at least 8 columns for consistency.
I'm a bit surprised that the performance improves this much, considering that
the C910 is notoriously bad at both segmented and strided loads. It might be
that the C version is just very bad due to lack of aliasing optimisations. Oh
well.
Note that you could do the double-precision version with very little extra effort.
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-09-26 18:34 ` Rémi Denis-Courmont
@ 2023-09-26 18:40 ` Paul B Mahol
2023-09-26 18:44 ` Rémi Denis-Courmont
0 siblings, 1 reply; 18+ messages in thread
From: Paul B Mahol @ 2023-09-26 18:40 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: flow gg
On Tue, Sep 26, 2023 at 8:35 PM Rémi Denis-Courmont <remi@remlab.net> wrote:
> Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> > benchmark:
> > fcmul_add_c: 19.7
> > fcmul_add_rvv_f32: 6.7
>
> Nit: please pad mnemonics to at least 8 columns for consistency.
>
> I'm a bit surprised that the performance improves this much, considering
> that
> the C910 is notoriously bad at both segmented strided loads. It might be
> that
> the C versions is just very bad due to lack of aliasing optimisations. Oh
> well.
>
What exactly do you mean the C version is missing?
>
> Note that you could do the double versions with very little extra efforts.
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-09-26 18:40 ` Paul B Mahol
@ 2023-09-26 18:44 ` Rémi Denis-Courmont
2023-09-27 1:47 ` flow gg
0 siblings, 1 reply; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-26 18:44 UTC (permalink / raw)
To: FFmpeg development discussions and patches, Paul B Mahol
Le tiistaina 26. syyskuuta 2023, 21.40.12 EEST Paul B Mahol a écrit :
> On Tue, Sep 26, 2023 at 8:35 PM Rémi Denis-Courmont <remi@remlab.net> wrote:
> > Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> > > benchmark:
> > > fcmul_add_c: 19.7
> > > fcmul_add_rvv_f32: 6.7
> >
> > Nit: please pad mnemonics to at least 8 columns for consistency.
> >
> > I'm a bit surprised that the performance improves this much, considering
> > that
> > the C910 is notoriously bad at both segmented strided loads. It might be
> > that
> > the C versions is just very bad due to lack of aliasing optimisations. Oh
> > well.
>
> What you mean exactly that C version is missing?
The C version does not have any restrict qualifier. This potentially prevents
the C compiler from unrolling. Adding the keyword can yield performance
gains of 20-30% on RISC-V scalar floating point.
That said, sometimes you can't validly use restrict, and you simply can't tell
the C compiler how to optimise properly. In those cases, even scalar floating
point optimisations improve performance.
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-09-26 9:24 [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add flow gg
2023-09-26 18:34 ` Rémi Denis-Courmont
@ 2023-09-26 18:50 ` Rémi Denis-Courmont
2023-09-27 16:41 ` Rémi Denis-Courmont
2 siblings, 0 replies; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-26 18:50 UTC (permalink / raw)
To: ffmpeg-devel
Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> benchmark:
> fcmul_add_c: 19.7
> fcmul_add_rvv_f32: 6.7
+ li t1, 4
+ vsetvli t0, t1, e32, m1, ta, ma
vsetivli t0, 4, ...
But really, DO NOT use a fixed vector length here. At best, you're wasting half
the vector width. Your input has a variable size, use it.
+
+ li t2, 8
+
+ vlsseg2e32.v v0, (a1), t2
I'm not sure what you are trying to achieve here. It seems that you could just
as well use vlseg2 without register stride, no?
+ vlsseg2e32.v v2, (a2), t2
+ vlsseg2e32.v v4, (a0), t2
+
+ vfmul.vv v6, v0, v2
+ vfmul.vv v7, v1, v3
+ vfmul.vv v8, v0, v3
+ vfmul.vv v9, v1, v2
+
+ vfadd.vv v4, v4, v6
+ vfsub.vv v4, v4, v7
+ vfadd.vv v5, v5, v8
+ vfadd.vv v5, v5, v9
+
+ vssseg2e32.v v4, (a0), t2
Same here.
--
レミ・デニ-クールモン
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-09-26 18:44 ` Rémi Denis-Courmont
@ 2023-09-27 1:47 ` flow gg
2023-09-27 16:01 ` Rémi Denis-Courmont
2023-09-27 16:27 ` Rémi Denis-Courmont
0 siblings, 2 replies; 18+ messages in thread
From: flow gg @ 2023-09-27 1:47 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 3373 bytes --]
>>> please pad mnemonics to at least 8 columns for consistency
okay, changed
>>> It seems that you could just as well use vlseg2 without register
stride, no?
yes, vlseg will be better, changed
>>> Note that you could do the double versions with very little extra
efforts.
okay
>>> But really, DO NOT use a fixed vector length here. At best, you're
wasting half
>>> the vector width. Your input has a variable size, use it.
okay, changed
>>> I'm a bit surprised that the performance improves this much,
considering that
>>> the C910 is notoriously bad at both segmented strided loads. It might
be that
>>> the C versions is just very bad due to lack of aliasing optimisations.
thanks, You reminded me.
Sorry I had forgotten that there was a problem..
A few days ago, I wanted to try running some existing benchmarks,
```
tests/checkasm/checkasm --bench --test=aacpsdsp
tests/checkasm/checkasm --bench --test=alacdsp
tests/checkasm/checkasm --bench --test=audiodsp
tests/checkasm/checkasm --bench --test=g722dsp
tests/checkasm/checkasm --bench --test=vorbisdsp
tests/checkasm/checkasm --bench --test=float_dsp
tests/checkasm/checkasm --bench --test=fixed_dsp
tests/checkasm/checkasm --bench --test=af_afir
```
but they all returned 0.0.
For example,
```
butterflies_float_c: 0.0
butterflies_float_rvv_f32: 0.0
scalarproduct_float_c: 0.0
scalarproduct_float_rvv_f32: 0.0
vector_dmac_scalar_c: 0.0
vector_dmac_scalar_rvv_f64: 0.0
...
```
I tried changing the -O3 in configure to -O2 or -O1, but still got 0.0.
Only by changing to -O0 did I receive non-zero results.
So, the benchmark I conducted was based on this, and I obtained the initial
results…
fcmul_add_c: 19.7
fcmul_add_rvv_f32: 6.7
Rémi Denis-Courmont <remi@remlab.net> 于2023年9月27日周三 02:44写道:
> Le tiistaina 26. syyskuuta 2023, 21.40.12 EEST Paul B Mahol a écrit :
> > On Tue, Sep 26, 2023 at 8:35 PM Rémi Denis-Courmont <remi@remlab.net>
> wrote:
> > > Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> > > > benchmark:
> > > > fcmul_add_c: 19.7
> > > > fcmul_add_rvv_f32: 6.7
> > >
> > > Nit: please pad mnemonics to at least 8 columns for consistency.
> > >
> > > I'm a bit surprised that the performance improves this much,
> considering
> > > that
> > > the C910 is notoriously bad at both segmented strided loads. It might
> be
> > > that
> > > the C versions is just very bad due to lack of aliasing optimisations.
> Oh
> > > well.
> >
> > What you mean exactly that C version is missing?
>
> The C version does not have any restrict qualifier. This potentially
> prevents
> the C compiler from unrolling. Adding the keyword can improve performance
> gains of 20-30% on RISC-V scalar floating point.
>
> That said, sometimes you can't validly use restrict, and you simply can't
> tell
> the C compiler how to optimise properly. In those cases, even scalar
> floating
> point optimisations improve performance.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
[-- Attachment #2: 0001-af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 4996 bytes --]
From 5343b40c2e1849daadd79d6d531aeb7a762e0e06 Mon Sep 17 00:00:00 2001
From: h <hlefthleft@gmail.com>
Date: Tue, 26 Sep 2023 15:03:12 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add
fcmul_add_c: 19.7
fcmul_add_rvv_f32: 6.7
---
libavfilter/af_afirdsp.h | 3 ++
libavfilter/riscv/Makefile | 2 ++
libavfilter/riscv/af_afir_init.c | 39 ++++++++++++++++++++++
libavfilter/riscv/af_afir_rvv.S | 55 ++++++++++++++++++++++++++++++++
4 files changed, 99 insertions(+)
create mode 100644 libavfilter/riscv/Makefile
create mode 100644 libavfilter/riscv/af_afir_init.c
create mode 100644 libavfilter/riscv/af_afir_rvv.S
diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..d2d1e909c1 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
} AudioFIRDSPContext;
void ff_afir_init_x86(AudioFIRDSPContext *s);
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
{
@@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
#if ARCH_X86
ff_afir_init_x86(dsp);
+#elif ARCH_RISCV
+ ff_afir_init_riscv(dsp);
#endif
}
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..ffa176abd2
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+ ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+ int flags = av_get_cpu_flags();
+
+ if (flags & AV_CPU_FLAG_RVV_F32)
+ s->fcmul_add = ff_fcmul_add_rvv;
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..e060d03b6e
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+// void ff_fcmul_add_rvv(float *sum, const float *t, const float *c, ptrdiff_t len)
+func ff_fcmul_add_rvv, zve32f
+1:
+ vsetvli t0, a3, e32, m1, ta, ma
+ vlseg2e32.v v0, (a1)
+ vlseg2e32.v v2, (a2)
+ vlseg2e32.v v4, (a0)
+ vfmul.vv v6, v0, v2
+ vfmul.vv v7, v1, v3
+ vfmul.vv v8, v0, v3
+ vfmul.vv v9, v1, v2
+ vfadd.vv v4, v4, v6
+ vfsub.vv v4, v4, v7
+ vfadd.vv v5, v5, v8
+ vfadd.vv v5, v5, v9
+ vsseg2e32.v v4, (a0)
+ li t2, 8
+ mul t3, t2, t0
+ add a0, a0, t3
+ add a1, a1, t3
+ add a2, a2, t3
+ sub a3, a3, t0
+ bgtz a3, 1b
+
+ flw fa0, 0(a1)
+ flw fa1, 0(a2)
+ flw fa2, 0(a0)
+ fmul.s fa0, fa0, fa1
+ fadd.s fa2, fa2, fa0
+ fsw fa2, 0(a0)
+
+ ret
+endfunc
--
2.42.0
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-09-27 1:47 ` flow gg
@ 2023-09-27 16:01 ` Rémi Denis-Courmont
2023-09-27 16:27 ` Rémi Denis-Courmont
1 sibling, 0 replies; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-27 16:01 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Le keskiviikkona 27. syyskuuta 2023, 4.47.26 EEST flow gg a écrit :
> ```
> tests/checkasm/checkasm --bench --test=aacpsdsp
> tests/checkasm/checkasm --bench --test=alacdsp
> tests/checkasm/checkasm --bench --test=audiodsp
> tests/checkasm/checkasm --bench --test=g722dsp
> tests/checkasm/checkasm --bench --test=vorbisdsp
> tests/checkasm/checkasm --bench --test=float_dsp
> tests/checkasm/checkasm --bench --test=fixed_dsp
> tests/checkasm/checkasm --bench --test=af_afir
> ```
>
> but they all returned 0.0.
Your checkasm setup is broken somehow, though I don't know why (I have not
tested on C910 recently, so maybe it's just how it is). But in any case,
performance metrics from C code compiled with -O0 are worthless.
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-09-27 1:47 ` flow gg
2023-09-27 16:01 ` Rémi Denis-Courmont
@ 2023-09-27 16:27 ` Rémi Denis-Courmont
1 sibling, 0 replies; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-27 16:27 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Le keskiviikkona 27. syyskuuta 2023, 4.47.26 EEST flow gg a écrit :
> >>> please pad mnemonics to at least 8 columns for consistency
>
> okay, changed
>
> >>> It seems that you could just as well use vlseg2 without register
>
> stride, no?
>
> yes, vlseg will better, changed
>
> >>> Note that you could do the double versions with very little extra
>
> efforts.
>
> okay
>
> >>> But really, DO NOT use a fixed vector length here. At best, you're
>
> wasting half
>
> >>> the vector width. Your input has a variable size, use it.
>
> okay, changed
>
> >>> I'm a bit surprised that the performance improves this much,
>
> considering that
>
> >>> the C910 is notoriously bad at both segmented strided loads. It might
>
> be that
>
> >>> the C versions is just very bad due to lack of aliasing optimisations.
>
> thanks, You reminded me.
> Sorry I had forgotten that there was a problem..
> A few days ago, I wanted to try running some existing benchmarks,
>
> ```
> tests/checkasm/checkasm --bench --test=aacpsdsp
> tests/checkasm/checkasm --bench --test=alacdsp
> tests/checkasm/checkasm --bench --test=audiodsp
> tests/checkasm/checkasm --bench --test=g722dsp
> tests/checkasm/checkasm --bench --test=vorbisdsp
> tests/checkasm/checkasm --bench --test=float_dsp
> tests/checkasm/checkasm --bench --test=fixed_dsp
> tests/checkasm/checkasm --bench --test=af_afir
> ```
>
> but they all returned 0.0.
>
> For example,
>
> ```
> butterflies_float_c: 0.0
> butterflies_float_rvv_f32: 0.0
> scalarproduct_float_c: 0.0
> scalarproduct_float_rvv_f32: 0.0
> vector_dmac_scalar_c: 0.0
> vector_dmac_scalar_rvv_f64: 0.0
> ...
OK, this reproduces on both SiFive and T-Head hardware here. You need to
revert 09731fbfc3a914ec4f6ffad60aa9062db6a8f6aa.
--
レミ・デニ-クールモン
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-09-26 9:24 [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add flow gg
2023-09-26 18:34 ` Rémi Denis-Courmont
2023-09-26 18:50 ` Rémi Denis-Courmont
@ 2023-09-27 16:41 ` Rémi Denis-Courmont
2023-09-28 5:45 ` flow gg
2 siblings, 1 reply; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-27 16:41 UTC (permalink / raw)
To: ffmpeg-devel
Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> benchmark:
> fcmul_add_c: 19.7
> fcmul_add_rvv_f32: 6.7
With optimisations enabled and the benchmarking fix, I get this (on the same
hardware, I believe):
fcmul_add_c: 3.5
fcmul_add_rvv_f32: 6.7
For sure unfortunate design limitations of T-Head C910 are to blame to no
small extent. It is not the first occurrence of an RVV optimisation that turns
out worse than scalar due to those, and I still have honest hopes that newer
(and conformant) IP would give saner results, but... I also believe that the
code could be improved regardless.
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-09-27 16:41 ` Rémi Denis-Courmont
@ 2023-09-28 5:45 ` flow gg
2023-09-28 13:33 ` Rémi Denis-Courmont
0 siblings, 1 reply; 18+ messages in thread
From: flow gg @ 2023-09-28 5:45 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 1584 bytes --]
Okay, I reverted the volatile change in ff_read_time
How about this version?
This version uses vlse instead of vlseg, and uses vfmacc
The benchmark is sometimes better, sometimes the same
fcmul_add_c: 3.5
fcmul_add_rvv_f32: 3.5
- af_afir.fcmul_add [OK]
fcmul_add_c: 4.5
fcmul_add_rvv_f32: 4.2
- af_afir.fcmul_add [OK]
fcmul_add_c: 4.2
fcmul_add_rvv_f32: 4.2
- af_afir.fcmul_add [OK]
fcmul_add_c: 4.5
fcmul_add_rvv_f32: 4.2
- af_afir.fcmul_add [OK]
fcmul_add_c: 4.7
fcmul_add_rvv_f32: 3.5
Rémi Denis-Courmont <remi@remlab.net> 于2023年9月28日周四 00:41写道:
> Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> > benchmark:
> > fcmul_add_c: 19.7
> > fcmul_add_rvv_f32: 6.7
>
> With optimisations enabled and the benchmarking fix, I get this (on the
> same
> hardware, I believe):
>
> fcmul_add_c: 3.5
> fcmul_add_rvv_f32: 6.7
>
> For sure unfortunate design limitations of T-Head C910 are to blame to no
> small extent. It is not the first occurrence of an RVV optimisation that
> turns
> out worse than scalar due to those, and I still have honest hopes that
> newer
> (and conformant) IP would give saner results, but... I also believe that
> the
> code could be improved regardless.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
[-- Attachment #2: 0001-af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 5159 bytes --]
From 3e956958b01e780b7360dace59b6248f61a6f12c Mon Sep 17 00:00:00 2001
From: h <hlefthleft@gmail.com>
Date: Tue, 26 Sep 2023 15:03:12 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add
---
libavfilter/af_afirdsp.h | 3 ++
libavfilter/riscv/Makefile | 2 ++
libavfilter/riscv/af_afir_init.c | 39 ++++++++++++++++++++
libavfilter/riscv/af_afir_rvv.S | 61 ++++++++++++++++++++++++++++++++
4 files changed, 105 insertions(+)
create mode 100644 libavfilter/riscv/Makefile
create mode 100644 libavfilter/riscv/af_afir_init.c
create mode 100644 libavfilter/riscv/af_afir_rvv.S
diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..d2d1e909c1 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
} AudioFIRDSPContext;
void ff_afir_init_x86(AudioFIRDSPContext *s);
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
{
@@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
#if ARCH_X86
ff_afir_init_x86(dsp);
+#elif ARCH_RISCV
+ ff_afir_init_riscv(dsp);
#endif
}
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..ffa176abd2
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+ ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+ int flags = av_get_cpu_flags();
+
+ if (flags & AV_CPU_FLAG_RVV_F32)
+ s->fcmul_add = ff_fcmul_add_rvv;
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..6c15586007
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+// void ff_fcmul_add_rvv(float *sum, const float *t, const float *c, ptrdiff_t len)
+func ff_fcmul_add_rvv, zve32f
+1:
+ vsetvli t0, a3, e32, m1, ta, ma
+ li t1, 8
+ li t2, 4
+ vlse32.v v0, (a1), t1
+ add a1, a1, t2
+ vlse32.v v1, (a1), t1
+ sub a1, a1, t2
+ vlse32.v v2, (a2), t1
+ add a2, a2, t2
+ vlse32.v v3, (a2), t1
+ sub a2, a2, t2
+ vlse32.v v4, (a0), t1
+ add a0, a0, t2
+ vlse32.v v5, (a0), t1
+ sub a0, a0, t2
+ vfmacc.vv v4, v0, v2
+ vfnmsac.vv v4, v1, v3
+ vfmacc.vv v5, v0, v3
+ vfmacc.vv v5, v1, v2
+ vsseg2e32.v v4, (a0)
+ mul t3, t1, t0
+ add a0, a0, t3
+ add a1, a1, t3
+ add a2, a2, t3
+ sub a3, a3, t0
+ bgtz a3, 1b
+
+ flw fa0, 0(a1)
+ flw fa1, 0(a2)
+ flw fa2, 0(a0)
+ fmul.s fa0, fa0, fa1
+ fadd.s fa2, fa2, fa0
+ fsw fa2, 0(a0)
+
+ ret
+endfunc
--
2.42.0
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-09-28 5:45 ` flow gg
@ 2023-09-28 13:33 ` Rémi Denis-Courmont
2023-11-13 9:43 ` flow gg
0 siblings, 1 reply; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-28 13:33 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Le 28 septembre 2023 08:45:44 GMT+03:00, flow gg <hlefthleft@gmail.com> a écrit :
>Okay, I revert the volatile in ff_read_time
>
>How about this version?
It's still using register stride which is all but guaranteed to be slow on any hardware and should only be used as a last resort.
The code is also missing scheduling for multi-issue and unrolling with the group multiplier.
And lastly, while that probably won't change much, there are no reasons to use mul here. You can use shNadd like existing code does.
>
>use vls instead vlseg, and use vfmacc
>
>The benchmark is sometimes better, sometimes the same
>
>fcmul_add_c: 3.5
>fcmul_add_rvv_f32: 3.5
> - af_afir.fcmul_add [OK]
>fcmul_add_c: 4.5
>fcmul_add_rvv_f32: 4.2
> - af_afir.fcmul_add [OK]
>fcmul_add_c: 4.2
>fcmul_add_rvv_f32: 4.2
> - af_afir.fcmul_add [OK]
>fcmul_add_c: 4.5
>fcmul_add_rvv_f32: 4.2
> - af_afir.fcmul_add [OK]
>fcmul_add_c: 4.7
>fcmul_add_rvv_f32: 3.5
>
>
>Rémi Denis-Courmont <remi@remlab.net> 于2023年9月28日周四 00:41写道:
>
>> Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
>> > benchmark:
>> > fcmul_add_c: 19.7
>> > fcmul_add_rvv_f32: 6.7
>>
>> With optimisations enabled and the benchmarking fix, I get this (on the
>> same
>> hardware, I believe):
>>
>> fcmul_add_c: 3.5
>> fcmul_add_rvv_f32: 6.7
>>
>> For sure unfortunate design limitations of T-Head C910 are to blame to no
>> small extent. It is not the first occurrence of an RVV optimisation that
>> turns
>> out worse than scalar due to those, and I still have honest hopes that
>> newer
>> (and conformant) IP would give saner results, but... I also believe that
>> the
>> code could be improved regardless.
>>
>> --
>> Rémi Denis-Courmont
>> http://www.remlab.net/
>>
>>
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-09-28 13:33 ` Rémi Denis-Courmont
@ 2023-11-13 9:43 ` flow gg
2023-11-13 15:35 ` Rémi Denis-Courmont
0 siblings, 1 reply; 18+ messages in thread
From: flow gg @ 2023-11-13 9:43 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 2905 bytes --]
Sorry for the long delay in responding.
How is the modified patch now?
It no longer uses register strides (learned from your code), and it has
switched to shNadd instead of mul.
(using m4 and m2 as they are slightly faster than m8 and m4)
benchmark:
fcmul_add_c: 2179
fcmul_add_rvv_f32: 1652
Rémi Denis-Courmont <remi@remlab.net> 于2023年9月28日周四 21:33写道:
>
>
> Le 28 septembre 2023 08:45:44 GMT+03:00, flow gg <hlefthleft@gmail.com> a
> écrit :
> >Okay, I revert the volatile in ff_read_time
> >
> >How about this version?
>
> It's still using register stride which is all but guaranteed to be slow on
> any hardware and should only be used as a last resort.
>
> The code is also missing scheduling for multi-issue and unrolling with the
> group multiplier.
>
> And lastly, while that probably won't change much, there are no reasons to
> use mul here. You can use shNadd like existing code does.
>
>
> >
> >use vls instead vlseg, and use vfmacc
> >
> >The benchmark is sometimes better, sometimes the same
> >
> >fcmul_add_c: 3.5
> >fcmul_add_rvv_f32: 3.5
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.5
> >fcmul_add_rvv_f32: 4.2
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.2
> >fcmul_add_rvv_f32: 4.2
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.5
> >fcmul_add_rvv_f32: 4.2
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.7
> >fcmul_add_rvv_f32: 3.5
> >
> >
> >Rémi Denis-Courmont <remi@remlab.net> 于2023年9月28日周四 00:41写道:
> >
> >> Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> >> > benchmark:
> >> > fcmul_add_c: 19.7
> >> > fcmul_add_rvv_f32: 6.7
> >>
> >> With optimisations enabled and the benchmarking fix, I get this (on the
> >> same
> >> hardware, I believe):
> >>
> >> fcmul_add_c: 3.5
> >> fcmul_add_rvv_f32: 6.7
> >>
> >> For sure unfortunate design limitations of T-Head C910 are to blame to
> no
> >> small extent. It is not the first occurrence of an RVV optimisation that
> >> turns
> >> out worse than scalar due to those, and I still have honest hopes that
> >> newer
> >> (and conformant) IP would give saner results, but... I also believe that
> >> the
> >> code could be improved regardless.
> >>
> >> --
> >> Rémi Denis-Courmont
> >> http://www.remlab.net/
> >>
> >>
> >>
> >> _______________________________________________
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel@ffmpeg.org
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
[-- Attachment #2: af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 5417 bytes --]
From 4199887247d31348385cd864b4efd6f4c02740f2 Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Fri, 3 Nov 2023 10:35:53 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add
benchmark:
fcmul_add_c: 2179
fcmul_add_rvv_f32: 1652
---
libavfilter/af_afirdsp.h | 3 ++
libavfilter/riscv/Makefile | 2 ++
libavfilter/riscv/af_afir_init.c | 39 ++++++++++++++++++++
libavfilter/riscv/af_afir_rvv.S | 61 ++++++++++++++++++++++++++++++++
4 files changed, 105 insertions(+)
create mode 100644 libavfilter/riscv/Makefile
create mode 100644 libavfilter/riscv/af_afir_init.c
create mode 100644 libavfilter/riscv/af_afir_rvv.S
diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..d2d1e909c1 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
} AudioFIRDSPContext;
void ff_afir_init_x86(AudioFIRDSPContext *s);
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
{
@@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
#if ARCH_X86
ff_afir_init_x86(dsp);
+#elif ARCH_RISCV
+ ff_afir_init_riscv(dsp);
#endif
}
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..13df8341e7
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+ ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+ int flags = av_get_cpu_flags();
+
+ if (flags & AV_CPU_FLAG_RVV_F32)
+ s->fcmul_add = ff_fcmul_add_rvv;
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..078cac8e7e
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2023 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+// void ff_fcmul_add_rvv(float *sum, const float *t, const float *c, ptrdiff_t len)
+func ff_fcmul_add_rvv, zve32f
+ li t1, 32
+1:
+ vsetvli t0, a3, e64, m4, ta, ma
+ vle64.v v12, (a0)
+ sub a3, a3, t0
+ vsetvli zero, zero, e32, m2, ta, ma
+ vnsrl.vx v8, v12, zero
+ vnsrl.vx v10, v12, t1
+ vsetvli zero, zero, e64, m4, ta, ma
+ vle64.v v12, (a1)
+ sh3add a1, t0, a1
+ vsetvli zero, zero, e32, m2, ta, ma
+ vnsrl.vx v0, v12, zero
+ vnsrl.vx v2, v12, t1
+ vsetvli zero, zero, e64, m4, ta, ma
+ vle64.v v12, (a2)
+ sh3add a2, t0, a2
+ vsetvli zero, zero, e32, m2, ta, ma
+ vnsrl.vx v4, v12, zero
+ vnsrl.vx v6, v12, t1
+ vfmacc.vv v8, v0, v4
+ vfnmsac.vv v8, v2, v6
+ vfmacc.vv v10, v0, v6
+ vfmacc.vv v10, v2, v4
+ vsseg2e32.v v8, (a0)
+ sh3add a0, t0, a0
+ bgtz a3, 1b
+
+ flw fa0, 0(a1)
+ flw fa1, 0(a2)
+ flw fa2, 0(a0)
+ fmul.s fa0, fa0, fa1
+ fadd.s fa2, fa2, fa0
+ fsw fa2, 0(a0)
+
+ ret
+endfunc
--
2.42.1
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-11-13 9:43 ` flow gg
@ 2023-11-13 15:35 ` Rémi Denis-Courmont
2023-11-13 16:01 ` Paul B Mahol
` (2 more replies)
0 siblings, 3 replies; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-11-13 15:35 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: flow gg
Hi,
Le maanantaina 13. marraskuuta 2023, 11.43.01 EET flow gg a écrit :
> Sorry for the long delay in responding.
No problem. Working with T-Head C910 (or C920?) cores is very tedious. I gave
up on that and switched over to Kendryte K230 (based on C908) now.
> How is the modified patch now?
It looks better, but some minute improvements are still possible.
> no longer using register stride(learn from your code) and have switched to
> shNadd instead.
>
> (using m4 and m2 as they are slightly faster than m8 and m4)
>
> benchmark:
> fcmul_add_c: 2179
> fcmul_add_rvv_f32: 1652
> diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
> index 4208501393..d2d1e909c1 100644
> --- a/libavfilter/af_afirdsp.h
> +++ b/libavfilter/af_afirdsp.h
> @@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
> } AudioFIRDSPContext;
>
> void ff_afir_init_x86(AudioFIRDSPContext *s);
> +void ff_afir_init_riscv(AudioFIRDSPContext *s);
Nit: please stick to alphabetical order like most similar code.
>
> static void fcmul_add_c(float *sum, const float *t, const float *c,
> ptrdiff_t len)
> {
> @@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext
> *dsp)
>
> #if ARCH_X86
> ff_afir_init_x86(dsp);
> +#elif ARCH_RISCV
> + ff_afir_init_riscv(dsp);
Ditto.
> #endif
> }
>
> diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
> new file mode 100644
> index 0000000000..0b968a9c0d
> --- /dev/null
> +++ b/libavfilter/riscv/Makefile
> @@ -0,0 +1,2 @@
> +OBJS += riscv/af_afir_init.o
> +RVV-OBJS += riscv/af_afir_rvv.o
> diff --git a/libavfilter/riscv/af_afir_init.c
> b/libavfilter/riscv/af_afir_init.c new file mode 100644
> index 0000000000..13df8341e7
> --- /dev/null
> +++ b/libavfilter/riscv/af_afir_init.c
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "config.h"
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavfilter/af_afirdsp.h"
> +
> +void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
> + ptrdiff_t len);
> +
> +av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
> +{
> +#if HAVE_RVV
> + int flags = av_get_cpu_flags();
> +
> + if (flags & AV_CPU_FLAG_RVV_F32)
You need to check for Zba as well here. I doubt that we'll see hardware with V
and without Zba in real life, but for the sake of correctness...
> + s->fcmul_add = ff_fcmul_add_rvv;
> +#endif
> +}
> diff --git a/libavfilter/riscv/af_afir_rvv.S
> b/libavfilter/riscv/af_afir_rvv.S new file mode 100644
> index 0000000000..078cac8e7e
> --- /dev/null
> +++ b/libavfilter/riscv/af_afir_rvv.S
> @@ -0,0 +1,61 @@
> +/*
> + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +// void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
> +func ff_fcmul_add_rvv, zve32f
> + li t1, 32
> +1:
> + vsetvli t0, a3, e64, m4, ta, ma
You can set SEW=32 and corresponding LMUL here. Then you can remove all other
VSETVLI instances below. (Note that this will NOT work on draft 0.7.1
hardware, but it does work on conformant hardware.)
> + vle64.v v12, (a0)
This requires 64-bit alignment. I don't know if this is correct for this
specific filter, so I leave it to other people to comment here.
> + sub a3, a3, t0
> + vsetvli zero, zero, e32, m2, ta, ma
> + vnsrl.vx v8, v12, zero
> + vnsrl.vx v10, v12, t1
> + vsetvli zero, zero, e64, m4, ta, ma
> + vle64.v v12, (a1)
> + sh3add a1, t0, a1
> + vsetvli zero, zero, e32, m2, ta, ma
> + vnsrl.vx v0, v12, zero
> + vnsrl.vx v2, v12, t1
> + vsetvli zero, zero, e64, m4, ta, ma
> + vle64.v v12, (a2)
> + sh3add a2, t0, a2
> + vsetvli zero, zero, e32, m2, ta, ma
> + vnsrl.vx v4, v12, zero
> + vnsrl.vx v6, v12, t1
> + vfmacc.vv v8, v0, v4
> + vfnmsac.vv v8, v2, v6
> + vfmacc.vv v10, v0, v6
Swap the two instructions above for better pipeline utilisation on in-order
CPUs.
> + vfmacc.vv v10, v2, v4
> + vsseg2e32.v v8, (a0)
> + sh3add a0, t0, a0
> + bgtz a3, 1b
> +
> + flw fa0, 0(a1)
> + flw fa1, 0(a2)
> + flw fa2, 0(a0)
> + fmul.s fa0, fa0, fa1
> + fadd.s fa2, fa2, fa0
It won't make much difference, but you can use a fused multiply-add here.
> + fsw fa2, 0(a0)
> +
> + ret
> +endfunc
While you're at it, this looks like it could easily be adapted for the double
precision version. In fact, it will be simpler, since you will have to use
vlseg2e64 rather than vle128.v+vnsrl.vx+vnsrl.vx. But if you decide to
implement that too, please keep it a separate patch.
--
レミ・デニ-クールモン
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-11-13 15:35 ` Rémi Denis-Courmont
@ 2023-11-13 16:01 ` Paul B Mahol
2023-11-15 8:57 ` flow gg
2023-11-15 8:59 ` flow gg
2 siblings, 0 replies; 18+ messages in thread
From: Paul B Mahol @ 2023-11-13 16:01 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: flow gg
On Mon, Nov 13, 2023 at 4:35 PM Rémi Denis-Courmont <remi@remlab.net> wrote:
> Hi,
>
> Le maanantaina 13. marraskuuta 2023, 11.43.01 EET flow gg a écrit :
> > Sorry for the long delay in responding.
>
> No problem. Working with T-Head C910 (or C920?) cores is very tedious. I
> gave
> up on that and switched over to Kendryte K230 (based on C908) now.
>
> > How is the modified patch now?
>
> It looks better, but some minute improvements are still possible.
>
> > no longer using register stride(learn from your code) and have switched
> to
> > shNadd instead.
> >
> > (using m4 and m2 as they are slightly faster than m8 and m4)
> >
> > benchmark:
> > fcmul_add_c: 2179
> > fcmul_add_rvv_f32: 1652
>
> > diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
> > index 4208501393..d2d1e909c1 100644
> > --- a/libavfilter/af_afirdsp.h
> > +++ b/libavfilter/af_afirdsp.h
> > @@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
> > } AudioFIRDSPContext;
> >
> > void ff_afir_init_x86(AudioFIRDSPContext *s);
> > +void ff_afir_init_riscv(AudioFIRDSPContext *s);
>
> Nit: please stick to alphabetical order like most similar code.
>
> >
> > static void fcmul_add_c(float *sum, const float *t, const float *c,
> > ptrdiff_t len)
> > {
> > @@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext
> > *dsp)
> >
> > #if ARCH_X86
> > ff_afir_init_x86(dsp);
> > +#elif ARCH_RISCV
> > + ff_afir_init_riscv(dsp);
>
> Ditto.
>
> > #endif
> > }
> >
> > diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
> > new file mode 100644
> > index 0000000000..0b968a9c0d
> > --- /dev/null
> > +++ b/libavfilter/riscv/Makefile
> > @@ -0,0 +1,2 @@
> > +OBJS += riscv/af_afir_init.o
> > +RVV-OBJS += riscv/af_afir_rvv.o
> > diff --git a/libavfilter/riscv/af_afir_init.c
> > b/libavfilter/riscv/af_afir_init.c new file mode 100644
> > index 0000000000..13df8341e7
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_init.c
> > @@ -0,0 +1,39 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include <stdint.h>
> > +
> > +#include "config.h"
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavfilter/af_afirdsp.h"
> > +
> > +void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
> > + ptrdiff_t len);
> > +
> > +av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
> > +{
> > +#if HAVE_RVV
> > + int flags = av_get_cpu_flags();
> > +
> > + if (flags & AV_CPU_FLAG_RVV_F32)
>
> You need to check for Zba as well here. I doubt that we'll see hardware
> with V
> and without Zba in real life, but for the sake of correctness...
>
> > + s->fcmul_add = ff_fcmul_add_rvv;
> > +#endif
> > +}
> > diff --git a/libavfilter/riscv/af_afir_rvv.S
> > b/libavfilter/riscv/af_afir_rvv.S new file mode 100644
> > index 0000000000..078cac8e7e
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_rvv.S
> > @@ -0,0 +1,61 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +// void ff_fcmul_add(float *sum, const float *t, const float *c, int
> len)
> > +func ff_fcmul_add_rvv, zve32f
> > + li t1, 32
> > +1:
> > + vsetvli t0, a3, e64, m4, ta, ma
>
> You can set SEW=32 and corresponding LMUL here. Then you can remove all
> other
> VSETVLI instances below. (Note that this will NOT work on draft 0.7.1
> hardware, but it does work on conformant hardware.)
>
> > + vle64.v v12, (a0)
>
> This requires 64-bit alignment. I don't know if this is correct for this
> specific filter, so I leave it to other people to comment here.
>
The array should be aligned, as it is allocated by libavutil calls.
The buffer sizes are aligned using av_cpu_align(), so if that returns
the correct size it should work.
>
> > + sub a3, a3, t0
> > + vsetvli zero, zero, e32, m2, ta, ma
> > + vnsrl.vx v8, v12, zero
> > + vnsrl.vx v10, v12, t1
> > + vsetvli zero, zero, e64, m4, ta, ma
> > + vle64.v v12, (a1)
> > + sh3add a1, t0, a1
> > + vsetvli zero, zero, e32, m2, ta, ma
> > + vnsrl.vx v0, v12, zero
> > + vnsrl.vx v2, v12, t1
> > + vsetvli zero, zero, e64, m4, ta, ma
> > + vle64.v v12, (a2)
> > + sh3add a2, t0, a2
> > + vsetvli zero, zero, e32, m2, ta, ma
> > + vnsrl.vx v4, v12, zero
> > + vnsrl.vx v6, v12, t1
> > + vfmacc.vv v8, v0, v4
> > + vfnmsac.vv v8, v2, v6
> > + vfmacc.vv v10, v0, v6
>
> Swap the two instructions above for better pipeline utilisation on
> in-order
> CPUs.
>
> > + vfmacc.vv v10, v2, v4
> > + vsseg2e32.v v8, (a0)
> > + sh3add a0, t0, a0
> > + bgtz a3, 1b
> > +
> > + flw fa0, 0(a1)
> > + flw fa1, 0(a2)
> > + flw fa2, 0(a0)
> > + fmul.s fa0, fa0, fa1
> > + fadd.s fa2, fa2, fa0
>
> It won't make much difference, but you can use a fused multiply-add here.
>
> > + fsw fa2, 0(a0)
> > +
> > + ret
> > +endfunc
>
> While you're at it, this looks like it could easily be adapted for the
> double
> precision version. In fact, it will be simpler, since you will have to use
> vlseg2e64 rather than vle128.v+vnsrl.vx+vnsrl.vx. But if you decide to
> implement that too, please keep it a separate patch.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-11-13 15:35 ` Rémi Denis-Courmont
2023-11-13 16:01 ` Paul B Mahol
@ 2023-11-15 8:57 ` flow gg
2023-11-15 8:59 ` flow gg
2 siblings, 0 replies; 18+ messages in thread
From: flow gg @ 2023-11-15 8:57 UTC (permalink / raw)
To: Rémi Denis-Courmont, FFmpeg development discussions and patches
Okay, I have updated these issues in the patch.
Rémi Denis-Courmont <remi@remlab.net> 于2023年11月13日周一 23:35写道:
> Hi,
>
> Le maanantaina 13. marraskuuta 2023, 11.43.01 EET flow gg a écrit :
> > Sorry for the long delay in responding.
>
> No problem. Working with T-Head C910 (or C920?) cores is very tedious. I
> gave
> up on that and switched over to Kendryte K230 (based on C908) now.
>
> > How is the modified patch now?
>
> It looks better, but some minute improvements are still possible.
>
> > no longer using register stride(learn from your code) and have switched
> to
> > shNadd instead.
> >
> > (using m4 and m2 as they are slightly faster than m8 and m4)
> >
> > benchmark:
> > fcmul_add_c: 2179
> > fcmul_add_rvv_f32: 1652
>
> > diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
> > index 4208501393..d2d1e909c1 100644
> > --- a/libavfilter/af_afirdsp.h
> > +++ b/libavfilter/af_afirdsp.h
> > @@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
> > } AudioFIRDSPContext;
> >
> > void ff_afir_init_x86(AudioFIRDSPContext *s);
> > +void ff_afir_init_riscv(AudioFIRDSPContext *s);
>
> Nit: please stick to alphabetical order like most similar code.
>
> >
> > static void fcmul_add_c(float *sum, const float *t, const float *c,
> > ptrdiff_t len)
> > {
> > @@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext
> > *dsp)
> >
> > #if ARCH_X86
> > ff_afir_init_x86(dsp);
> > +#elif ARCH_RISCV
> > + ff_afir_init_riscv(dsp);
>
> Ditto.
>
> > #endif
> > }
> >
> > diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
> > new file mode 100644
> > index 0000000000..0b968a9c0d
> > --- /dev/null
> > +++ b/libavfilter/riscv/Makefile
> > @@ -0,0 +1,2 @@
> > +OBJS += riscv/af_afir_init.o
> > +RVV-OBJS += riscv/af_afir_rvv.o
> > diff --git a/libavfilter/riscv/af_afir_init.c
> > b/libavfilter/riscv/af_afir_init.c new file mode 100644
> > index 0000000000..13df8341e7
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_init.c
> > @@ -0,0 +1,39 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include <stdint.h>
> > +
> > +#include "config.h"
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavfilter/af_afirdsp.h"
> > +
> > +void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
> > + ptrdiff_t len);
> > +
> > +av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
> > +{
> > +#if HAVE_RVV
> > + int flags = av_get_cpu_flags();
> > +
> > + if (flags & AV_CPU_FLAG_RVV_F32)
>
> You need to check for Zba as well here. I doubt that we'll see hardware
> with V
> and without Zba in real life, but for the sake of correctness...
>
> > + s->fcmul_add = ff_fcmul_add_rvv;
> > +#endif
> > +}
> > diff --git a/libavfilter/riscv/af_afir_rvv.S
> > b/libavfilter/riscv/af_afir_rvv.S new file mode 100644
> > index 0000000000..078cac8e7e
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_rvv.S
> > @@ -0,0 +1,61 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +// void ff_fcmul_add(float *sum, const float *t, const float *c, int
> len)
> > +func ff_fcmul_add_rvv, zve32f
> > + li t1, 32
> > +1:
> > + vsetvli t0, a3, e64, m4, ta, ma
>
> You can set SEW=32 and corresponding LMUL here. Then you can remove all
> other
> VSETVLI instances below. (Note that this will NOT work on draft 0.7.1
> hardware, but it does work on conformant hardware.)
>
> > + vle64.v v12, (a0)
>
> This requires 64-bit alignment. I don't know if this is correct for this
> specific filter, so I leave it to other people to comment here.
>
> > + sub a3, a3, t0
> > + vsetvli zero, zero, e32, m2, ta, ma
> > + vnsrl.vx v8, v12, zero
> > + vnsrl.vx v10, v12, t1
> > + vsetvli zero, zero, e64, m4, ta, ma
> > + vle64.v v12, (a1)
> > + sh3add a1, t0, a1
> > + vsetvli zero, zero, e32, m2, ta, ma
> > + vnsrl.vx v0, v12, zero
> > + vnsrl.vx v2, v12, t1
> > + vsetvli zero, zero, e64, m4, ta, ma
> > + vle64.v v12, (a2)
> > + sh3add a2, t0, a2
> > + vsetvli zero, zero, e32, m2, ta, ma
> > + vnsrl.vx v4, v12, zero
> > + vnsrl.vx v6, v12, t1
> > + vfmacc.vv v8, v0, v4
> > + vfnmsac.vv v8, v2, v6
> > + vfmacc.vv v10, v0, v6
>
> Swap the two instructions above for better pipeline utilisation on
> in-order
> CPUs.
>
> > + vfmacc.vv v10, v2, v4
> > + vsseg2e32.v v8, (a0)
> > + sh3add a0, t0, a0
> > + bgtz a3, 1b
> > +
> > + flw fa0, 0(a1)
> > + flw fa1, 0(a2)
> > + flw fa2, 0(a0)
> > + fmul.s fa0, fa0, fa1
> > + fadd.s fa2, fa2, fa0
>
> It won't make much difference, but you can use a fused multiply-add here.
>
> > + fsw fa2, 0(a0)
> > +
> > + ret
> > +endfunc
>
> While you're at it, this looks like it could easily be adapted for the
> double
> precision version. In fact, it will be simpler, since you will have to use
> vlseg2e64 rather than vle128.v+vnsrl.vx+vnsrl.vx. But if you decide to
> implement that too, please keep it a separate patch.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-11-13 15:35 ` Rémi Denis-Courmont
2023-11-13 16:01 ` Paul B Mahol
2023-11-15 8:57 ` flow gg
@ 2023-11-15 8:59 ` flow gg
2023-11-15 15:05 ` Rémi Denis-Courmont
2 siblings, 1 reply; 18+ messages in thread
From: flow gg @ 2023-11-15 8:59 UTC (permalink / raw)
To: Rémi Denis-Courmont, FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 7444 bytes --]
Okay, I have updated these issues in the patch.
Rémi Denis-Courmont <remi@remlab.net> 于2023年11月13日周一 23:35写道:
> Hi,
>
> Le maanantaina 13. marraskuuta 2023, 11.43.01 EET flow gg a écrit :
> > Sorry for the long delay in responding.
>
> No problem. Working with T-Head C910 (or C920?) cores is very tedious. I
> gave
> up on that and switched over to Kendryte K230 (based on C908) now.
>
> > How is the modified patch now?
>
> It looks better, but some minute improvements are still possible.
>
> > no longer using register stride(learn from your code) and have switched
> to
> > shNadd instead.
> >
> > (using m4 and m2 as they are slightly faster than m8 and m4)
> >
> > benchmark:
> > fcmul_add_c: 2179
> > fcmul_add_rvv_f32: 1652
>
> > diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
> > index 4208501393..d2d1e909c1 100644
> > --- a/libavfilter/af_afirdsp.h
> > +++ b/libavfilter/af_afirdsp.h
> > @@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
> > } AudioFIRDSPContext;
> >
> > void ff_afir_init_x86(AudioFIRDSPContext *s);
> > +void ff_afir_init_riscv(AudioFIRDSPContext *s);
>
> Nit: please stick to alphabetical order like most similar code.
>
> >
> > static void fcmul_add_c(float *sum, const float *t, const float *c,
> > ptrdiff_t len)
> > {
> > @@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext
> > *dsp)
> >
> > #if ARCH_X86
> > ff_afir_init_x86(dsp);
> > +#elif ARCH_RISCV
> > + ff_afir_init_riscv(dsp);
>
> Ditto.
>
> > #endif
> > }
> >
> > diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
> > new file mode 100644
> > index 0000000000..0b968a9c0d
> > --- /dev/null
> > +++ b/libavfilter/riscv/Makefile
> > @@ -0,0 +1,2 @@
> > +OBJS += riscv/af_afir_init.o
> > +RVV-OBJS += riscv/af_afir_rvv.o
> > diff --git a/libavfilter/riscv/af_afir_init.c
> > b/libavfilter/riscv/af_afir_init.c new file mode 100644
> > index 0000000000..13df8341e7
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_init.c
> > @@ -0,0 +1,39 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include <stdint.h>
> > +
> > +#include "config.h"
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavfilter/af_afirdsp.h"
> > +
> > +void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
> > + ptrdiff_t len);
> > +
> > +av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
> > +{
> > +#if HAVE_RVV
> > + int flags = av_get_cpu_flags();
> > +
> > + if (flags & AV_CPU_FLAG_RVV_F32)
>
> You need to check for Zba as well here. I doubt that we'll see hardware
> with V
> and without Zba in real life, but for the sake of correctness...
>
> > + s->fcmul_add = ff_fcmul_add_rvv;
> > +#endif
> > +}
> > diff --git a/libavfilter/riscv/af_afir_rvv.S
> > b/libavfilter/riscv/af_afir_rvv.S new file mode 100644
> > index 0000000000..078cac8e7e
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_rvv.S
> > @@ -0,0 +1,61 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +// void ff_fcmul_add(float *sum, const float *t, const float *c, int
> len)
> > +func ff_fcmul_add_rvv, zve32f
> > + li t1, 32
> > +1:
> > + vsetvli t0, a3, e64, m4, ta, ma
>
> You can set SEW=32 and corresponding LMUL here. Then you can remove all
> other
> VSETVLI instances below. (Note that this will NOT work on draft 0.7.1
> hardware, but it does work on conformant hardware.)
>
> > + vle64.v v12, (a0)
>
> This requires 64-bit alignment. I don't know if this is correct for this
> specific filter, so I leave it to other people to comment here.
>
> > + sub a3, a3, t0
> > + vsetvli zero, zero, e32, m2, ta, ma
> > + vnsrl.vx v8, v12, zero
> > + vnsrl.vx v10, v12, t1
> > + vsetvli zero, zero, e64, m4, ta, ma
> > + vle64.v v12, (a1)
> > + sh3add a1, t0, a1
> > + vsetvli zero, zero, e32, m2, ta, ma
> > + vnsrl.vx v0, v12, zero
> > + vnsrl.vx v2, v12, t1
> > + vsetvli zero, zero, e64, m4, ta, ma
> > + vle64.v v12, (a2)
> > + sh3add a2, t0, a2
> > + vsetvli zero, zero, e32, m2, ta, ma
> > + vnsrl.vx v4, v12, zero
> > + vnsrl.vx v6, v12, t1
> > + vfmacc.vv v8, v0, v4
> > + vfnmsac.vv v8, v2, v6
> > + vfmacc.vv v10, v0, v6
>
> Swap the two instructions above for better pipeline utilisation on
> in-order
> CPUs.
>
> > + vfmacc.vv v10, v2, v4
> > + vsseg2e32.v v8, (a0)
> > + sh3add a0, t0, a0
> > + bgtz a3, 1b
> > +
> > + flw fa0, 0(a1)
> > + flw fa1, 0(a2)
> > + flw fa2, 0(a0)
> > + fmul.s fa0, fa0, fa1
> > + fadd.s fa2, fa2, fa0
>
> It won't make much difference, but you can use a fused multiply-add here.
>
> > + fsw fa2, 0(a0)
> > +
> > + ret
> > +endfunc
>
> While you're at it, this looks like it could easily be adapted for the
> double
> precision version. In fact, it will be simpler, since you will have to use
> vlseg2e64 rather than vle128.v+vnsrl.vx+vnsrl.vx. But if you decide to
> implement that too, please keep it a separate patch.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
>
[-- Attachment #2: af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 5356 bytes --]
From 66e681b6d85a9ffead13f5bc21975f7110f6c84f Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Fri, 3 Nov 2023 10:35:53 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add
benchmark:
fcmul_add_c: 2179
fcmul_add_rvv_f32: 1652
---
libavfilter/af_afirdsp.h | 5 ++-
libavfilter/riscv/Makefile | 2 ++
libavfilter/riscv/af_afir_init.c | 42 ++++++++++++++++++++++++
libavfilter/riscv/af_afir_rvv.S | 55 ++++++++++++++++++++++++++++++++
4 files changed, 103 insertions(+), 1 deletion(-)
create mode 100644 libavfilter/riscv/Makefile
create mode 100644 libavfilter/riscv/af_afir_init.c
create mode 100644 libavfilter/riscv/af_afir_rvv.S
diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..827e067a9b 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -33,6 +33,7 @@ typedef struct AudioFIRDSPContext {
ptrdiff_t len);
} AudioFIRDSPContext;
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
void ff_afir_init_x86(AudioFIRDSPContext *s);
static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
@@ -74,7 +75,9 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
dsp->fcmul_add = fcmul_add_c;
dsp->dcmul_add = dcmul_add_c;
-#if ARCH_X86
+#if ARCH_RISCV
+ ff_afir_init_riscv(dsp);
+#elif ARCH_X86
ff_afir_init_x86(dsp);
#endif
}
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..c05cc26c04
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+ ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+ int flags = av_get_cpu_flags();
+
+ if (flags & AV_CPU_FLAG_RVV_F32) {
+ if (flags & AV_CPU_FLAG_RVB_ADDR) {
+ s->fcmul_add = ff_fcmul_add_rvv;
+ }
+ }
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..ff4146133f
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+// void ff_fcmul_add_rvv(float *sum, const float *t, const float *c, ptrdiff_t len)
+func ff_fcmul_add_rvv, zve32f
+ li t1, 32
+1:
+ vsetvli t0, a3, e32, m4, ta, ma
+ vle64.v v24, (a0)
+ sub a3, a3, t0
+ vnsrl.wx v16, v24, zero
+ vnsrl.wx v20, v24, t1
+ vle64.v v24, (a1)
+ sh3add a1, t0, a1
+ vnsrl.wx v0, v24, zero
+ vnsrl.wx v4, v24, t1
+ vle64.v v24, (a2)
+ sh3add a2, t0, a2
+ vnsrl.wx v8, v24, zero
+ vnsrl.wx v12, v24, t1
+ vfmacc.vv v16, v0, v8
+ vfmacc.vv v20, v4, v8
+ vfnmsac.vv v16, v4, v12
+ vfmacc.vv v20, v0, v12
+ vsseg2e32.v v16, (a0)
+ sh3add a0, t0, a0
+ bgtz a3, 1b
+
+ flw fa0, 0(a1)
+ flw fa1, 0(a2)
+ flw fa2, 0(a0)
+ fmadd.s fa2, fa0, fa1, fa2
+ fsw fa2, 0(a0)
+
+ ret
+endfunc
--
2.42.1
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-11-15 8:59 ` flow gg
@ 2023-11-15 15:05 ` Rémi Denis-Courmont
2023-11-15 23:04 ` flow gg
0 siblings, 1 reply; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-11-15 15:05 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Le keskiviikkona 15. marraskuuta 2023, 10.59.55 EET flow gg a écrit :
> Okay, I have updated these issues in the patch.
It does not assemble but I can fix it locally. The narrowing shift trickery
requires Zve64x, or rather Zve64f in this case.
The performance improvement is much better on newer hardware:
fcmul_add_c: 4891.2
fcmul_add_rvv_f64: 2399.5
FWIW, VLSEG2E32.V remains slightly worse than with shifting:
fcmul_add_c: 4891.2
fcmul_add_rvv_f32: 2877.5
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
2023-11-15 15:05 ` Rémi Denis-Courmont
@ 2023-11-15 23:04 ` flow gg
0 siblings, 0 replies; 18+ messages in thread
From: flow gg @ 2023-11-15 23:04 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 997 bytes --]
Okay, I have changed them to the 64-bit variants (zve64f and AV_CPU_FLAG_RVV_F64) and added a description to the commit message.
Rémi Denis-Courmont <remi@remlab.net> 于2023年11月15日周三 23:06写道:
> Le keskiviikkona 15. marraskuuta 2023, 10.59.55 EET flow gg a écrit :
> > Okay, I have updated these issues in the patch.
>
> It does not assemble but I can fix it locally. The narrowing shift
> trickery
> requires Zve64x, or rather Zve64f in this case.
>
> The performance improvement is much better on newer hardware:
> fcmul_add_c: 4891.2
> fcmul_add_rvv_f64: 2399.5
>
> FWIW, VLSEG2E32.V remains slightly worse than with shifting:
> fcmul_add_c: 4891.2
> fcmul_add_rvv_f32: 2877.5
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
[-- Attachment #2: af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 5486 bytes --]
From 6b88fbf9b94c098841197c9fcb467006177ee4c6 Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Fri, 3 Nov 2023 10:35:53 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add
Segmented loads are slow, so here we use unit-strided load and narrowing shifts.
c910:
fcmul_add_c: 2179
fcmul_add_rvv_f64: 1652
c908:
fcmul_add_c: 4891.2
fcmul_add_rvv_f64: 2399.5
---
libavfilter/af_afirdsp.h | 5 ++-
libavfilter/riscv/Makefile | 2 ++
libavfilter/riscv/af_afir_init.c | 42 ++++++++++++++++++++++++
libavfilter/riscv/af_afir_rvv.S | 55 ++++++++++++++++++++++++++++++++
4 files changed, 103 insertions(+), 1 deletion(-)
create mode 100644 libavfilter/riscv/Makefile
create mode 100644 libavfilter/riscv/af_afir_init.c
create mode 100644 libavfilter/riscv/af_afir_rvv.S
diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..827e067a9b 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -33,6 +33,7 @@ typedef struct AudioFIRDSPContext {
ptrdiff_t len);
} AudioFIRDSPContext;
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
void ff_afir_init_x86(AudioFIRDSPContext *s);
static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
@@ -74,7 +75,9 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
dsp->fcmul_add = fcmul_add_c;
dsp->dcmul_add = dcmul_add_c;
-#if ARCH_X86
+#if ARCH_RISCV
+ ff_afir_init_riscv(dsp);
+#elif ARCH_X86
ff_afir_init_x86(dsp);
#endif
}
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..52aa18c126
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+ ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+ int flags = av_get_cpu_flags();
+
+ if (flags & AV_CPU_FLAG_RVV_F64) {
+ if (flags & AV_CPU_FLAG_RVB_ADDR) {
+ s->fcmul_add = ff_fcmul_add_rvv;
+ }
+ }
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..04ec2e50d8
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+// void ff_fcmul_add_rvv(float *sum, const float *t, const float *c, ptrdiff_t len)
+func ff_fcmul_add_rvv, zve64f
+ li t1, 32
+1:
+ vsetvli t0, a3, e32, m4, ta, ma
+ vle64.v v24, (a0)
+ sub a3, a3, t0
+ vnsrl.wx v16, v24, zero
+ vnsrl.wx v20, v24, t1
+ vle64.v v24, (a1)
+ sh3add a1, t0, a1
+ vnsrl.wx v0, v24, zero
+ vnsrl.wx v4, v24, t1
+ vle64.v v24, (a2)
+ sh3add a2, t0, a2
+ vnsrl.wx v8, v24, zero
+ vnsrl.wx v12, v24, t1
+ vfmacc.vv v16, v0, v8
+ vfmacc.vv v20, v4, v8
+ vfnmsac.vv v16, v4, v12
+ vfmacc.vv v20, v0, v12
+ vsseg2e32.v v16, (a0)
+ sh3add a0, t0, a0
+ bgtz a3, 1b
+
+ flw fa0, 0(a1)
+ flw fa1, 0(a2)
+ flw fa2, 0(a0)
+ fmadd.s fa2, fa0, fa1, fa2
+ fsw fa2, 0(a0)
+
+ ret
+endfunc
--
2.42.1
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 18+ messages in thread
end of thread, other threads:[~2023-11-15 23:05 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-26 9:24 [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add flow gg
2023-09-26 18:34 ` Rémi Denis-Courmont
2023-09-26 18:40 ` Paul B Mahol
2023-09-26 18:44 ` Rémi Denis-Courmont
2023-09-27 1:47 ` flow gg
2023-09-27 16:01 ` Rémi Denis-Courmont
2023-09-27 16:27 ` Rémi Denis-Courmont
2023-09-26 18:50 ` Rémi Denis-Courmont
2023-09-27 16:41 ` Rémi Denis-Courmont
2023-09-28 5:45 ` flow gg
2023-09-28 13:33 ` Rémi Denis-Courmont
2023-11-13 9:43 ` flow gg
2023-11-13 15:35 ` Rémi Denis-Courmont
2023-11-13 16:01 ` Paul B Mahol
2023-11-15 8:57 ` flow gg
2023-11-15 8:59 ` flow gg
2023-11-15 15:05 ` Rémi Denis-Courmont
2023-11-15 23:04 ` flow gg
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git