Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
@ 2023-09-26  9:24 flow gg
  2023-09-26 18:34 ` Rémi Denis-Courmont
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: flow gg @ 2023-09-26  9:24 UTC (permalink / raw)
  To: ffmpeg-devel

[-- Attachment #1: Type: text/plain, Size: 52 bytes --]

benchmark:
fcmul_add_c: 19.7
fcmul_add_rvv_f32: 6.7

[-- Attachment #2: 0001-af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 4830 bytes --]

From 6bef2523728a472bb803ce085a1aafdfd624e212 Mon Sep 17 00:00:00 2001
From: h <hlefthleft@gmail.com>
Date: Tue, 26 Sep 2023 15:03:12 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add

fcmul_add_c: 19.7
fcmul_add_rvv_f32: 6.7
---
 libavfilter/af_afirdsp.h         |  3 ++
 libavfilter/riscv/Makefile       |  2 +
 libavfilter/riscv/af_afir_init.c | 39 +++++++++++++++++++
 libavfilter/riscv/af_afir_rvv.S  | 65 ++++++++++++++++++++++++++++++++
 4 files changed, 109 insertions(+)
 create mode 100644 libavfilter/riscv/Makefile
 create mode 100644 libavfilter/riscv/af_afir_init.c
 create mode 100644 libavfilter/riscv/af_afir_rvv.S

diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..d2d1e909c1 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
 } AudioFIRDSPContext;
 
 void ff_afir_init_x86(AudioFIRDSPContext *s);
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
 
 static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
 {
@@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
 
 #if ARCH_X86
     ff_afir_init_x86(dsp);
+#elif ARCH_RISCV
+    ff_afir_init_riscv(dsp);
 #endif
 }
 
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..ffa176abd2
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+                       ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+    int flags = av_get_cpu_flags();
+
+    if (flags & AV_CPU_FLAG_RVV_F32)
+        s->fcmul_add = ff_fcmul_add_rvv;
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..06c3979575
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+//  void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
+func ff_fcmul_add_rvv, zve32f
+1:
+    li t1, 4
+    vsetvli  t0, t1, e32, m1, ta, ma
+
+    li t2, 8
+
+    vlsseg2e32.v v0, (a1), t2
+    vlsseg2e32.v v2, (a2), t2
+    vlsseg2e32.v v4, (a0), t2
+
+    vfmul.vv v6, v0, v2
+    vfmul.vv v7, v1, v3
+    vfmul.vv v8, v0, v3
+    vfmul.vv v9, v1, v2
+
+    vfadd.vv v4, v4, v6
+    vfsub.vv v4, v4, v7
+    vfadd.vv v5, v5, v8
+    vfadd.vv v5, v5, v9
+
+    vssseg2e32.v v4, (a0), t2
+
+    mul t3, t2, t1
+    add a0, a0, t3
+    add a1, a1, t3
+    add a2, a2, t3
+
+    sub a3, a3, t0
+    bgtz a3, 1b
+
+    flw     fa0, 0(a1)
+    flw     fa1, 0(a2)
+    flw     fa2, 0(a0)
+
+    fmul.s  fa0, fa0, fa1
+    fadd.s  fa2, fa2, fa0
+
+    fsw     fa2, 0(a0)
+
+    ret
+endfunc
-- 
2.42.0


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-09-26  9:24 [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add flow gg
@ 2023-09-26 18:34 ` Rémi Denis-Courmont
  2023-09-26 18:40   ` Paul B Mahol
  2023-09-26 18:50 ` Rémi Denis-Courmont
  2023-09-27 16:41 ` Rémi Denis-Courmont
  2 siblings, 1 reply; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-26 18:34 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: flow gg

Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> benchmark:
> fcmul_add_c: 19.7
> fcmul_add_rvv_f32: 6.7

Nit: please pad mnemonics to at least 8 columns for consistency.

I'm a bit surprised that the performance improves this much, considering that 
the C910 is notoriously bad at both segmented strided loads. It might be that 
the C versions is just very bad due to lack of aliasing optimisations. Oh 
well.

Note that you could do the double versions with very little extra efforts.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-09-26 18:34 ` Rémi Denis-Courmont
@ 2023-09-26 18:40   ` Paul B Mahol
  2023-09-26 18:44     ` Rémi Denis-Courmont
  0 siblings, 1 reply; 18+ messages in thread
From: Paul B Mahol @ 2023-09-26 18:40 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: flow gg

On Tue, Sep 26, 2023 at 8:35 PM Rémi Denis-Courmont <remi@remlab.net> wrote:

> Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> > benchmark:
> > fcmul_add_c: 19.7
> > fcmul_add_rvv_f32: 6.7
>
> Nit: please pad mnemonics to at least 8 columns for consistency.
>
> I'm a bit surprised that the performance improves this much, considering
> that
> the C910 is notoriously bad at both segmented strided loads. It might be
> that
> the C versions is just very bad due to lack of aliasing optimisations. Oh
> well.
>

What you mean exactly that C version is missing?


>
> Note that you could do the double versions with very little extra efforts.
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-09-26 18:40   ` Paul B Mahol
@ 2023-09-26 18:44     ` Rémi Denis-Courmont
  2023-09-27  1:47       ` flow gg
  0 siblings, 1 reply; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-26 18:44 UTC (permalink / raw)
  To: FFmpeg development discussions and patches, Paul B Mahol

Le tiistaina 26. syyskuuta 2023, 21.40.12 EEST Paul B Mahol a écrit :
> On Tue, Sep 26, 2023 at 8:35 PM Rémi Denis-Courmont <remi@remlab.net> wrote:
> > Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> > > benchmark:
> > > fcmul_add_c: 19.7
> > > fcmul_add_rvv_f32: 6.7
> > 
> > Nit: please pad mnemonics to at least 8 columns for consistency.
> > 
> > I'm a bit surprised that the performance improves this much, considering
> > that
> > the C910 is notoriously bad at both segmented strided loads. It might be
> > that
> > the C versions is just very bad due to lack of aliasing optimisations. Oh
> > well.
> 
> What you mean exactly that C version is missing?

The C version does not have any restrict qualifier. This potentially prevents 
the C compiler from unrolling. Adding the keyword can improve performance 
gains of 20-30% on RISC-V scalar floating point.

That said, sometimes you can't validly use restrict, and you simply can't tell 
the C compiler how to optimise properly. In those cases, even scalar floating 
point optimisations improve performance.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-09-26  9:24 [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add flow gg
  2023-09-26 18:34 ` Rémi Denis-Courmont
@ 2023-09-26 18:50 ` Rémi Denis-Courmont
  2023-09-27 16:41 ` Rémi Denis-Courmont
  2 siblings, 0 replies; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-26 18:50 UTC (permalink / raw)
  To: ffmpeg-devel

Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> benchmark:
> fcmul_add_c: 19.7
> fcmul_add_rvv_f32: 6.7

+    li t1, 4
+    vsetvli  t0, t1, e32, m1, ta, ma

vsetivli t0, 4, ...

But really, DO NOT use a fixed vector length here. At best, you're wasting half 
the vector width. Your input has a variable size, use it.

+
+    li t2, 8
+
+    vlsseg2e32.v v0, (a1), t2

I'm not sure what you are trying to achieve here. It seems that you could just 
as well use vlseg2 without register stride, no?

+    vlsseg2e32.v v2, (a2), t2
+    vlsseg2e32.v v4, (a0), t2
+
+    vfmul.vv v6, v0, v2
+    vfmul.vv v7, v1, v3
+    vfmul.vv v8, v0, v3
+    vfmul.vv v9, v1, v2
+
+    vfadd.vv v4, v4, v6
+    vfsub.vv v4, v4, v7
+    vfadd.vv v5, v5, v8
+    vfadd.vv v5, v5, v9
+
+    vssseg2e32.v v4, (a0), t2

Same here.


-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-09-26 18:44     ` Rémi Denis-Courmont
@ 2023-09-27  1:47       ` flow gg
  2023-09-27 16:01         ` Rémi Denis-Courmont
  2023-09-27 16:27         ` Rémi Denis-Courmont
  0 siblings, 2 replies; 18+ messages in thread
From: flow gg @ 2023-09-27  1:47 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 3373 bytes --]

>>> please pad mnemonics to at least 8 columns for consistency

okay, changed

>>> It seems that you could just as well use vlseg2 without register
stride, no?

yes, vlseg will better, changed

>>> Note that you could do the double versions with very little extra
efforts.

okay

>>> But really, DO NOT use a fixed vector length here. At best, you're
wasting half
>>> the vector width. Your input has a variable size, use it.

okay, changed

>>> I'm a bit surprised that the performance improves this much,
considering that
>>> the C910 is notoriously bad at both segmented strided loads. It might
be that
>>> the C versions is just very bad due to lack of aliasing optimisations.

thanks, You reminded me.
Sorry I had forgotten that there was a problem..
A few days ago, I wanted to try running some existing benchmarks,

```
tests/checkasm/checkasm --bench --test=aacpsdsp
tests/checkasm/checkasm --bench --test=alacdsp
tests/checkasm/checkasm --bench --test=audiodsp
tests/checkasm/checkasm --bench --test=g722dsp
tests/checkasm/checkasm --bench --test=vorbisdsp
tests/checkasm/checkasm --bench --test=float_dsp
tests/checkasm/checkasm --bench --test=fixed_dsp
tests/checkasm/checkasm --bench --test=af_afir
```

but they all returned 0.0.

For example,

```
butterflies_float_c: 0.0
butterflies_float_rvv_f32: 0.0
scalarproduct_float_c: 0.0
scalarproduct_float_rvv_f32: 0.0
vector_dmac_scalar_c: 0.0
vector_dmac_scalar_rvv_f64: 0.0
...
```

I tried changing the -O3 in configure to -O2 or -O1, but still got 0.0.

Only by changing to -O0 did I receive non-zero results.

So, the benchmark I conducted was based on this, and I obtained the initial
results…

fcmul_add_c: 19.7
fcmul_add_rvv_f32: 6.7

Rémi Denis-Courmont <remi@remlab.net> 于2023年9月27日周三 02:44写道:

> Le tiistaina 26. syyskuuta 2023, 21.40.12 EEST Paul B Mahol a écrit :
> > On Tue, Sep 26, 2023 at 8:35 PM Rémi Denis-Courmont <remi@remlab.net>
> wrote:
> > > Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> > > > benchmark:
> > > > fcmul_add_c: 19.7
> > > > fcmul_add_rvv_f32: 6.7
> > >
> > > Nit: please pad mnemonics to at least 8 columns for consistency.
> > >
> > > I'm a bit surprised that the performance improves this much,
> considering
> > > that
> > > the C910 is notoriously bad at both segmented strided loads. It might
> be
> > > that
> > > the C versions is just very bad due to lack of aliasing optimisations.
> Oh
> > > well.
> >
> > What you mean exactly that C version is missing?
>
> The C version does not have any restrict qualifier. This potentially
> prevents
> the C compiler from unrolling. Adding the keyword can improve performance
> gains of 20-30% on RISC-V scalar floating point.
>
> That said, sometimes you can't validly use restrict, and you simply can't
> tell
> the C compiler how to optimise properly. In those cases, even scalar
> floating
> point optimisations improve performance.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>

[-- Attachment #2: 0001-af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 4996 bytes --]

From 5343b40c2e1849daadd79d6d531aeb7a762e0e06 Mon Sep 17 00:00:00 2001
From: h <hlefthleft@gmail.com>
Date: Tue, 26 Sep 2023 15:03:12 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add

fcmul_add_c: 19.7
fcmul_add_rvv_f32: 6.7
---
 libavfilter/af_afirdsp.h         |  3 ++
 libavfilter/riscv/Makefile       |  2 ++
 libavfilter/riscv/af_afir_init.c | 39 ++++++++++++++++++++++
 libavfilter/riscv/af_afir_rvv.S  | 55 ++++++++++++++++++++++++++++++++
 4 files changed, 99 insertions(+)
 create mode 100644 libavfilter/riscv/Makefile
 create mode 100644 libavfilter/riscv/af_afir_init.c
 create mode 100644 libavfilter/riscv/af_afir_rvv.S

diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..d2d1e909c1 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
 } AudioFIRDSPContext;
 
 void ff_afir_init_x86(AudioFIRDSPContext *s);
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
 
 static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
 {
@@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
 
 #if ARCH_X86
     ff_afir_init_x86(dsp);
+#elif ARCH_RISCV
+    ff_afir_init_riscv(dsp);
 #endif
 }
 
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..ffa176abd2
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+                       ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+    int flags = av_get_cpu_flags();
+
+    if (flags & AV_CPU_FLAG_RVV_F32)
+        s->fcmul_add = ff_fcmul_add_rvv;
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..e060d03b6e
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+//  void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
+func ff_fcmul_add_rvv, zve32f
+1:
+        vsetvli     t0, a3, e32, m1, ta, ma
+        vlseg2e32.v v0, (a1)
+        vlseg2e32.v v2, (a2)
+        vlseg2e32.v v4, (a0)
+        vfmul.vv    v6, v0, v2
+        vfmul.vv    v7, v1, v3
+        vfmul.vv    v8, v0, v3
+        vfmul.vv    v9, v1, v2
+        vfadd.vv    v4, v4, v6
+        vfsub.vv    v4, v4, v7
+        vfadd.vv    v5, v5, v8
+        vfadd.vv    v5, v5, v9
+        vsseg2e32.v v4, (a0)
+        li          t2, 8
+        mul         t3, t2, t0
+        add         a0, a0, t3
+        add         a1, a1, t3
+        add         a2, a2, t3
+        sub         a3, a3, t0
+        bgtz        a3, 1b
+
+        flw         fa0, 0(a1)
+        flw         fa1, 0(a2)
+        flw         fa2, 0(a0)
+        fmul.s      fa0, fa0, fa1
+        fadd.s      fa2, fa2, fa0
+        fsw         fa2, 0(a0)
+
+        ret
+endfunc
-- 
2.42.0


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-09-27  1:47       ` flow gg
@ 2023-09-27 16:01         ` Rémi Denis-Courmont
  2023-09-27 16:27         ` Rémi Denis-Courmont
  1 sibling, 0 replies; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-27 16:01 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le keskiviikkona 27. syyskuuta 2023, 4.47.26 EEST flow gg a écrit :
> ```
> tests/checkasm/checkasm --bench --test=aacpsdsp
> tests/checkasm/checkasm --bench --test=alacdsp
> tests/checkasm/checkasm --bench --test=audiodsp
> tests/checkasm/checkasm --bench --test=g722dsp
> tests/checkasm/checkasm --bench --test=vorbisdsp
> tests/checkasm/checkasm --bench --test=float_dsp
> tests/checkasm/checkasm --bench --test=fixed_dsp
> tests/checkasm/checkasm --bench --test=af_afir
> ```
> 
> but they all returned 0.0.

Your checkasm setup is broken however why (I have not tested on C910 recently 
so maybe it's just how it is). But in any case, performance metrics from C 
code compiled with -O0 are worthless.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-09-27  1:47       ` flow gg
  2023-09-27 16:01         ` Rémi Denis-Courmont
@ 2023-09-27 16:27         ` Rémi Denis-Courmont
  1 sibling, 0 replies; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-27 16:27 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le keskiviikkona 27. syyskuuta 2023, 4.47.26 EEST flow gg a écrit :
> >>> please pad mnemonics to at least 8 columns for consistency
> 
> okay, changed
> 
> >>> It seems that you could just as well use vlseg2 without register
> 
> stride, no?
> 
> yes, vlseg will better, changed
> 
> >>> Note that you could do the double versions with very little extra
> 
> efforts.
> 
> okay
> 
> >>> But really, DO NOT use a fixed vector length here. At best, you're
> 
> wasting half
> 
> >>> the vector width. Your input has a variable size, use it.
> 
> okay, changed
> 
> >>> I'm a bit surprised that the performance improves this much,
> 
> considering that
> 
> >>> the C910 is notoriously bad at both segmented strided loads. It might
> 
> be that
> 
> >>> the C versions is just very bad due to lack of aliasing optimisations.
> 
> thanks, You reminded me.
> Sorry I had forgotten that there was a problem..
> A few days ago, I wanted to try running some existing benchmarks,
> 
> ```
> tests/checkasm/checkasm --bench --test=aacpsdsp
> tests/checkasm/checkasm --bench --test=alacdsp
> tests/checkasm/checkasm --bench --test=audiodsp
> tests/checkasm/checkasm --bench --test=g722dsp
> tests/checkasm/checkasm --bench --test=vorbisdsp
> tests/checkasm/checkasm --bench --test=float_dsp
> tests/checkasm/checkasm --bench --test=fixed_dsp
> tests/checkasm/checkasm --bench --test=af_afir
> ```
> 
> but they all returned 0.0.
> 
> For example,
> 
> ```
> butterflies_float_c: 0.0
> butterflies_float_rvv_f32: 0.0
> scalarproduct_float_c: 0.0
> scalarproduct_float_rvv_f32: 0.0
> vector_dmac_scalar_c: 0.0
> vector_dmac_scalar_rvv_f64: 0.0
> ...

OK, this reproduces on both SiFive and T-Head hardware here. You need to 
revert 09731fbfc3a914ec4f6ffad60aa9062db6a8f6aa.


-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-09-26  9:24 [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add flow gg
  2023-09-26 18:34 ` Rémi Denis-Courmont
  2023-09-26 18:50 ` Rémi Denis-Courmont
@ 2023-09-27 16:41 ` Rémi Denis-Courmont
  2023-09-28  5:45   ` flow gg
  2 siblings, 1 reply; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-27 16:41 UTC (permalink / raw)
  To: ffmpeg-devel

Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> benchmark:
> fcmul_add_c: 19.7
> fcmul_add_rvv_f32: 6.7

With optimisations enabled and the benchmarking fix, I get this (on the same 
hardware, I believe):

fcmul_add_c: 3.5
fcmul_add_rvv_f32: 6.7

For sure unfortunate design limitations of T-Head C910 are to blame to no 
small extent. It is not the first occurrence of an RVV optimisation that turns 
out worse than scalar due to those, and I still have honest hopes that newer 
(and conformant) IP would give saner results, but... I also believe that the 
code could be improved regardless.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-09-27 16:41 ` Rémi Denis-Courmont
@ 2023-09-28  5:45   ` flow gg
  2023-09-28 13:33     ` Rémi Denis-Courmont
  0 siblings, 1 reply; 18+ messages in thread
From: flow gg @ 2023-09-28  5:45 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1584 bytes --]

Okay, I revert the volatile in ff_read_time

How about this version?

use vls instead vlseg, and use vfmacc

The benchmark is sometimes better, sometimes the same

fcmul_add_c: 3.5
fcmul_add_rvv_f32: 3.5
 - af_afir.fcmul_add [OK]
fcmul_add_c: 4.5
fcmul_add_rvv_f32: 4.2
 - af_afir.fcmul_add [OK]
fcmul_add_c: 4.2
fcmul_add_rvv_f32: 4.2
 - af_afir.fcmul_add [OK]
fcmul_add_c: 4.5
fcmul_add_rvv_f32: 4.2
 - af_afir.fcmul_add [OK]
fcmul_add_c: 4.7
fcmul_add_rvv_f32: 3.5


Rémi Denis-Courmont <remi@remlab.net> 于2023年9月28日周四 00:41写道:

> Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> > benchmark:
> > fcmul_add_c: 19.7
> > fcmul_add_rvv_f32: 6.7
>
> With optimisations enabled and the benchmarking fix, I get this (on the
> same
> hardware, I believe):
>
> fcmul_add_c: 3.5
> fcmul_add_rvv_f32: 6.7
>
> For sure unfortunate design limitations of T-Head C910 are to blame to no
> small extent. It is not the first occurrence of an RVV optimisation that
> turns
> out worse than scalar due to those, and I still have honest hopes that
> newer
> (and conformant) IP would give saner results, but... I also believe that
> the
> code could be improved regardless.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>

[-- Attachment #2: 0001-af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 5159 bytes --]

From 3e956958b01e780b7360dace59b6248f61a6f12c Mon Sep 17 00:00:00 2001
From: h <hlefthleft@gmail.com>
Date: Tue, 26 Sep 2023 15:03:12 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add

---
 libavfilter/af_afirdsp.h         |  3 ++
 libavfilter/riscv/Makefile       |  2 ++
 libavfilter/riscv/af_afir_init.c | 39 ++++++++++++++++++++
 libavfilter/riscv/af_afir_rvv.S  | 61 ++++++++++++++++++++++++++++++++
 4 files changed, 105 insertions(+)
 create mode 100644 libavfilter/riscv/Makefile
 create mode 100644 libavfilter/riscv/af_afir_init.c
 create mode 100644 libavfilter/riscv/af_afir_rvv.S

diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..d2d1e909c1 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
 } AudioFIRDSPContext;
 
 void ff_afir_init_x86(AudioFIRDSPContext *s);
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
 
 static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
 {
@@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
 
 #if ARCH_X86
     ff_afir_init_x86(dsp);
+#elif ARCH_RISCV
+    ff_afir_init_riscv(dsp);
 #endif
 }
 
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..ffa176abd2
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+                       ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+    int flags = av_get_cpu_flags();
+
+    if (flags & AV_CPU_FLAG_RVV_F32)
+        s->fcmul_add = ff_fcmul_add_rvv;
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..6c15586007
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2023 hleft
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+//  void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
+func ff_fcmul_add_rvv, zve32f
+1:
+        vsetvli     t0, a3, e32, m1, ta, ma
+        li          t1, 8
+        li          t2, 4
+        vlse32.v    v0, (a1), t1
+        add         a1, a1, t2
+        vlse32.v    v1, (a1), t1
+        sub         a1, a1, t2
+        vlse32.v    v2, (a2), t1
+        add         a2, a2, t2
+        vlse32.v    v3, (a2), t1
+        sub         a2, a2, t2
+        vlse32.v    v4, (a0), t1
+        add         a0, a0, t2
+        vlse32.v    v5, (a0), t1
+        sub         a0, a0, t2
+        vfmacc.vv   v4, v0, v2
+        vfnmsac.vv  v4, v1, v3
+        vfmacc.vv   v5, v0, v3
+        vfmacc.vv   v5, v1, v2
+        vsseg2e32.v v4, (a0)
+        mul         t3, t1, t0
+        add         a0, a0, t3
+        add         a1, a1, t3
+        add         a2, a2, t3
+        sub         a3, a3, t0
+        bgtz        a3, 1b
+
+        flw         fa0, 0(a1)
+        flw         fa1, 0(a2)
+        flw         fa2, 0(a0)
+        fmul.s      fa0, fa0, fa1
+        fadd.s      fa2, fa2, fa0
+        fsw         fa2, 0(a0)
+
+        ret
+endfunc
-- 
2.42.0


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-09-28  5:45   ` flow gg
@ 2023-09-28 13:33     ` Rémi Denis-Courmont
  2023-11-13  9:43       ` flow gg
  0 siblings, 1 reply; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-09-28 13:33 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



Le 28 septembre 2023 08:45:44 GMT+03:00, flow gg <hlefthleft@gmail.com> a écrit :
>Okay, I revert the volatile in ff_read_time
>
>How about this version?

It's still using register stride which is all but guaranteed to be slow on any hardware and should only be used as a last resort.

The code is also missing scheduling for multi-issue and unrolling with the group multiplier.

And lastly, while that probably won't change much, there are no reasons to use mul here. You can use shNadd like existing code does.


>
>use vls instead vlseg, and use vfmacc
>
>The benchmark is sometimes better, sometimes the same
>
>fcmul_add_c: 3.5
>fcmul_add_rvv_f32: 3.5
> - af_afir.fcmul_add [OK]
>fcmul_add_c: 4.5
>fcmul_add_rvv_f32: 4.2
> - af_afir.fcmul_add [OK]
>fcmul_add_c: 4.2
>fcmul_add_rvv_f32: 4.2
> - af_afir.fcmul_add [OK]
>fcmul_add_c: 4.5
>fcmul_add_rvv_f32: 4.2
> - af_afir.fcmul_add [OK]
>fcmul_add_c: 4.7
>fcmul_add_rvv_f32: 3.5
>
>
>Rémi Denis-Courmont <remi@remlab.net> 于2023年9月28日周四 00:41写道:
>
>> Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
>> > benchmark:
>> > fcmul_add_c: 19.7
>> > fcmul_add_rvv_f32: 6.7
>>
>> With optimisations enabled and the benchmarking fix, I get this (on the
>> same
>> hardware, I believe):
>>
>> fcmul_add_c: 3.5
>> fcmul_add_rvv_f32: 6.7
>>
>> For sure unfortunate design limitations of T-Head C910 are to blame to no
>> small extent. It is not the first occurrence of an RVV optimisation that
>> turns
>> out worse than scalar due to those, and I still have honest hopes that
>> newer
>> (and conformant) IP would give saner results, but... I also believe that
>> the
>> code could be improved regardless.
>>
>> --
>> Rémi Denis-Courmont
>> http://www.remlab.net/
>>
>>
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-09-28 13:33     ` Rémi Denis-Courmont
@ 2023-11-13  9:43       ` flow gg
  2023-11-13 15:35         ` Rémi Denis-Courmont
  0 siblings, 1 reply; 18+ messages in thread
From: flow gg @ 2023-11-13  9:43 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 2905 bytes --]

Sorry for the long delay in responding.

How is the modified patch now?

no longer using register stride(learn from your code) and have switched to
shNadd instead.

(using m4 and m2 as they are slightly faster than m8 and m4)

benchmark:
fcmul_add_c: 2179
fcmul_add_rvv_f32: 1652

Rémi Denis-Courmont <remi@remlab.net> 于2023年9月28日周四 21:33写道:

>
>
> Le 28 septembre 2023 08:45:44 GMT+03:00, flow gg <hlefthleft@gmail.com> a
> écrit :
> >Okay, I revert the volatile in ff_read_time
> >
> >How about this version?
>
> It's still using register stride which is all but guaranteed to be slow on
> any hardware and should only be used as a last resort.
>
> The code is also missing scheduling for multi-issue and unrolling with the
> group multiplier.
>
> And lastly, while that probably won't change much, there are no reasons to
> use mul here. You can use shNadd like existing code does.
>
>
> >
> >use vls instead vlseg, and use vfmacc
> >
> >The benchmark is sometimes better, sometimes the same
> >
> >fcmul_add_c: 3.5
> >fcmul_add_rvv_f32: 3.5
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.5
> >fcmul_add_rvv_f32: 4.2
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.2
> >fcmul_add_rvv_f32: 4.2
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.5
> >fcmul_add_rvv_f32: 4.2
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.7
> >fcmul_add_rvv_f32: 3.5
> >
> >
> >Rémi Denis-Courmont <remi@remlab.net> 于2023年9月28日周四 00:41写道:
> >
> >> Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> >> > benchmark:
> >> > fcmul_add_c: 19.7
> >> > fcmul_add_rvv_f32: 6.7
> >>
> >> With optimisations enabled and the benchmarking fix, I get this (on the
> >> same
> >> hardware, I believe):
> >>
> >> fcmul_add_c: 3.5
> >> fcmul_add_rvv_f32: 6.7
> >>
> >> For sure unfortunate design limitations of T-Head C910 are to blame to
> no
> >> small extent. It is not the first occurrence of an RVV optimisation that
> >> turns
> >> out worse than scalar due to those, and I still have honest hopes that
> >> newer
> >> (and conformant) IP would give saner results, but... I also believe that
> >> the
> >> code could be improved regardless.
> >>
> >> --
> >> Rémi Denis-Courmont
> >> http://www.remlab.net/
> >>
> >>
> >>
> >> _______________________________________________
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel@ffmpeg.org
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>

[-- Attachment #2: af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 5417 bytes --]

From 4199887247d31348385cd864b4efd6f4c02740f2 Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Fri, 3 Nov 2023 10:35:53 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add

benchmark:
fcmul_add_c: 2179
fcmul_add_rvv_f32: 1652
---
 libavfilter/af_afirdsp.h         |  3 ++
 libavfilter/riscv/Makefile       |  2 ++
 libavfilter/riscv/af_afir_init.c | 39 ++++++++++++++++++++
 libavfilter/riscv/af_afir_rvv.S  | 61 ++++++++++++++++++++++++++++++++
 4 files changed, 105 insertions(+)
 create mode 100644 libavfilter/riscv/Makefile
 create mode 100644 libavfilter/riscv/af_afir_init.c
 create mode 100644 libavfilter/riscv/af_afir_rvv.S

diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..d2d1e909c1 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
 } AudioFIRDSPContext;
 
 void ff_afir_init_x86(AudioFIRDSPContext *s);
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
 
 static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
 {
@@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
 
 #if ARCH_X86
     ff_afir_init_x86(dsp);
+#elif ARCH_RISCV
+    ff_afir_init_riscv(dsp);
 #endif
 }
 
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..13df8341e7
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+                       ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+    int flags = av_get_cpu_flags();
+
+    if (flags & AV_CPU_FLAG_RVV_F32)
+        s->fcmul_add = ff_fcmul_add_rvv;
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..078cac8e7e
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+//  void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
+func ff_fcmul_add_rvv, zve32f
+        li          t1, 32
+1:
+        vsetvli     t0, a3, e64, m4, ta, ma
+        vle64.v     v12, (a0)
+        sub         a3, a3, t0
+        vsetvli     zero, zero, e32, m2, ta, ma
+        vnsrl.vx    v8, v12, zero
+        vnsrl.vx    v10, v12, t1
+        vsetvli     zero, zero, e64, m4, ta, ma
+        vle64.v     v12, (a1)
+        sh3add      a1, t0, a1
+        vsetvli     zero, zero, e32, m2, ta, ma
+        vnsrl.vx    v0, v12, zero
+        vnsrl.vx    v2, v12, t1
+        vsetvli     zero, zero, e64, m4, ta, ma
+        vle64.v     v12, (a2)
+        sh3add      a2, t0, a2
+        vsetvli     zero, zero, e32, m2, ta, ma
+        vnsrl.vx    v4, v12, zero
+        vnsrl.vx    v6, v12, t1
+        vfmacc.vv   v8, v0, v4
+        vfnmsac.vv  v8, v2, v6
+        vfmacc.vv   v10, v0, v6
+        vfmacc.vv   v10, v2, v4
+        vsseg2e32.v v8, (a0)
+        sh3add      a0, t0, a0
+        bgtz        a3, 1b
+
+        flw         fa0, 0(a1)
+        flw         fa1, 0(a2)
+        flw         fa2, 0(a0)
+        fmul.s      fa0, fa0, fa1
+        fadd.s      fa2, fa2, fa0
+        fsw         fa2, 0(a0)
+
+        ret
+endfunc
-- 
2.42.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-11-13  9:43       ` flow gg
@ 2023-11-13 15:35         ` Rémi Denis-Courmont
  2023-11-13 16:01           ` Paul B Mahol
                             ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-11-13 15:35 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: flow gg

   Hi,

Le maanantaina 13. marraskuuta 2023, 11.43.01 EET flow gg a écrit :
> Sorry for the long delay in responding.

No problem. Working with T-Head C910 (or C920?) cores is very tedious. I gave 
up on that and switched over to Kendryte K230 (based on C908) now.

> How is the modified patch now?

It looks better, but some minute improvements are still possible.

> no longer using register stride(learn from your code) and have switched to
> shNadd instead.
> 
> (using m4 and m2 as they are slightly faster than m8 and m4)
> 
> benchmark:
> fcmul_add_c: 2179
> fcmul_add_rvv_f32: 1652

> diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
> index 4208501393..d2d1e909c1 100644
> --- a/libavfilter/af_afirdsp.h
> +++ b/libavfilter/af_afirdsp.h
> @@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
>  } AudioFIRDSPContext;
> 
>  void ff_afir_init_x86(AudioFIRDSPContext *s);
> +void ff_afir_init_riscv(AudioFIRDSPContext *s);

Nit: please stick to alphabetical order like most similar code.

> 
>  static void fcmul_add_c(float *sum, const float *t, const float *c,
> ptrdiff_t len)
>  {
> @@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext
> *dsp)
> 
>  #if ARCH_X86
>      ff_afir_init_x86(dsp);
> +#elif ARCH_RISCV
> +    ff_afir_init_riscv(dsp);

Ditto.

>  #endif
>  }
> 
> diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
> new file mode 100644
> index 0000000000..0b968a9c0d
> --- /dev/null
> +++ b/libavfilter/riscv/Makefile
> @@ -0,0 +1,2 @@
> +OBJS += riscv/af_afir_init.o
> +RVV-OBJS += riscv/af_afir_rvv.o
> diff --git a/libavfilter/riscv/af_afir_init.c
> b/libavfilter/riscv/af_afir_init.c new file mode 100644
> index 0000000000..13df8341e7
> --- /dev/null
> +++ b/libavfilter/riscv/af_afir_init.c
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "config.h"
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavfilter/af_afirdsp.h"
> +
> +void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
> +                       ptrdiff_t len);
> +
> +av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
> +{
> +#if HAVE_RVV
> +    int flags = av_get_cpu_flags();
> +
> +    if (flags & AV_CPU_FLAG_RVV_F32)

You need to check for Zba as well here. I doubt that we'll see hardware with V 
and without Zba in real life, but for the sake of correctness...

> +        s->fcmul_add = ff_fcmul_add_rvv;
> +#endif
> +}
> diff --git a/libavfilter/riscv/af_afir_rvv.S
> b/libavfilter/riscv/af_afir_rvv.S new file mode 100644
> index 0000000000..078cac8e7e
> --- /dev/null
> +++ b/libavfilter/riscv/af_afir_rvv.S
> @@ -0,0 +1,61 @@
> +/*
> + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +//  void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
> +func ff_fcmul_add_rvv, zve32f
> +        li          t1, 32
> +1:
> +        vsetvli     t0, a3, e64, m4, ta, ma

You can set SEW=32 and corresponding LMUL here. Then you can remove all other 
VSETVLI instances below. (Note that this will NOT work on draft 0.7.1 
hardware, but it does work on conformant hardware.)

> +        vle64.v     v12, (a0)

This requires 64-bit alignment. I don't know if this is correct for this 
specific filter, so I leave it to other people to comment here.

> +        sub         a3, a3, t0
> +        vsetvli     zero, zero, e32, m2, ta, ma
> +        vnsrl.vx    v8, v12, zero
> +        vnsrl.vx    v10, v12, t1
> +        vsetvli     zero, zero, e64, m4, ta, ma
> +        vle64.v     v12, (a1)
> +        sh3add      a1, t0, a1
> +        vsetvli     zero, zero, e32, m2, ta, ma
> +        vnsrl.vx    v0, v12, zero
> +        vnsrl.vx    v2, v12, t1
> +        vsetvli     zero, zero, e64, m4, ta, ma
> +        vle64.v     v12, (a2)
> +        sh3add      a2, t0, a2
> +        vsetvli     zero, zero, e32, m2, ta, ma
> +        vnsrl.vx    v4, v12, zero
> +        vnsrl.vx    v6, v12, t1
> +        vfmacc.vv   v8, v0, v4
> +        vfnmsac.vv  v8, v2, v6
> +        vfmacc.vv   v10, v0, v6

Swap the two instructions above for better pipeline utilisation on in-order 
CPUs.

> +        vfmacc.vv   v10, v2, v4
> +        vsseg2e32.v v8, (a0)
> +        sh3add      a0, t0, a0
> +        bgtz        a3, 1b
> +
> +        flw         fa0, 0(a1)
> +        flw         fa1, 0(a2)
> +        flw         fa2, 0(a0)
> +        fmul.s      fa0, fa0, fa1
> +        fadd.s      fa2, fa2, fa0

It won't make much difference, but you can use a fused multiply-add here.

> +        fsw         fa2, 0(a0)
> +
> +        ret
> +endfunc

While you're at it, this looks like it could easily be adapted for the double 
precision version. In fact, it will be simpler, since you will have to use 
vlseg2e64 rather than vle128.v+vnsrl.vx+vnsrl.vx. But if you decide to 
implement that too, please keep it a separate patch.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-11-13 15:35         ` Rémi Denis-Courmont
@ 2023-11-13 16:01           ` Paul B Mahol
  2023-11-15  8:57           ` flow gg
  2023-11-15  8:59           ` flow gg
  2 siblings, 0 replies; 18+ messages in thread
From: Paul B Mahol @ 2023-11-13 16:01 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: flow gg

On Mon, Nov 13, 2023 at 4:35 PM Rémi Denis-Courmont <remi@remlab.net> wrote:

>    Hi,
>
> Le maanantaina 13. marraskuuta 2023, 11.43.01 EET flow gg a écrit :
> > Sorry for the long delay in responding.
>
> No problem. Working with T-Head C910 (or C920?) cores is very tedious. I
> gave
> up on that and switched over to Kendryte K230 (based on C908) now.
>
> > How is the modified patch now?
>
> It looks better, but some minute improvements are still possible.
>
> > no longer using register stride(learn from your code) and have switched
> to
> > shNadd instead.
> >
> > (using m4 and m2 as they are slightly faster than m8 and m4)
> >
> > benchmark:
> > fcmul_add_c: 2179
> > fcmul_add_rvv_f32: 1652
>
> > diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
> > index 4208501393..d2d1e909c1 100644
> > --- a/libavfilter/af_afirdsp.h
> > +++ b/libavfilter/af_afirdsp.h
> > @@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
> >  } AudioFIRDSPContext;
> >
> >  void ff_afir_init_x86(AudioFIRDSPContext *s);
> > +void ff_afir_init_riscv(AudioFIRDSPContext *s);
>
> Nit: please stick to alphabetical order like most similar code.
>
> >
> >  static void fcmul_add_c(float *sum, const float *t, const float *c,
> > ptrdiff_t len)
> >  {
> > @@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext
> > *dsp)
> >
> >  #if ARCH_X86
> >      ff_afir_init_x86(dsp);
> > +#elif ARCH_RISCV
> > +    ff_afir_init_riscv(dsp);
>
> Ditto.
>
> >  #endif
> >  }
> >
> > diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
> > new file mode 100644
> > index 0000000000..0b968a9c0d
> > --- /dev/null
> > +++ b/libavfilter/riscv/Makefile
> > @@ -0,0 +1,2 @@
> > +OBJS += riscv/af_afir_init.o
> > +RVV-OBJS += riscv/af_afir_rvv.o
> > diff --git a/libavfilter/riscv/af_afir_init.c
> > b/libavfilter/riscv/af_afir_init.c new file mode 100644
> > index 0000000000..13df8341e7
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_init.c
> > @@ -0,0 +1,39 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include <stdint.h>
> > +
> > +#include "config.h"
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavfilter/af_afirdsp.h"
> > +
> > +void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
> > +                       ptrdiff_t len);
> > +
> > +av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
> > +{
> > +#if HAVE_RVV
> > +    int flags = av_get_cpu_flags();
> > +
> > +    if (flags & AV_CPU_FLAG_RVV_F32)
>
> You need to check for Zba as well here. I doubt that we'll see hardware
> with V
> and without Zba in real life, but for the sake of correctness...
>
> > +        s->fcmul_add = ff_fcmul_add_rvv;
> > +#endif
> > +}
> > diff --git a/libavfilter/riscv/af_afir_rvv.S
> > b/libavfilter/riscv/af_afir_rvv.S new file mode 100644
> > index 0000000000..078cac8e7e
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_rvv.S
> > @@ -0,0 +1,61 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +//  void ff_fcmul_add(float *sum, const float *t, const float *c, int
> len)
> > +func ff_fcmul_add_rvv, zve32f
> > +        li          t1, 32
> > +1:
> > +        vsetvli     t0, a3, e64, m4, ta, ma
>
> You can set SEW=32 and corresponding LMUL here. Then you can remove all
> other
> VSETVLI instances below. (Note that this will NOT work on draft 0.7.1
> hardware, but it does work on conformant hardware.)
>
> > +        vle64.v     v12, (a0)
>
> This requires 64-bit alignment. I don't know if this is correct for this
> specific filter, so I leave it to other people to comment here.
>

Array should be aligned as allocated by libavutil calls.
The buffers sizes are aligned using av_cpu_align() so if that returns
correct size it should work.


>
> > +        sub         a3, a3, t0
> > +        vsetvli     zero, zero, e32, m2, ta, ma
> > +        vnsrl.vx    v8, v12, zero
> > +        vnsrl.vx    v10, v12, t1
> > +        vsetvli     zero, zero, e64, m4, ta, ma
> > +        vle64.v     v12, (a1)
> > +        sh3add      a1, t0, a1
> > +        vsetvli     zero, zero, e32, m2, ta, ma
> > +        vnsrl.vx    v0, v12, zero
> > +        vnsrl.vx    v2, v12, t1
> > +        vsetvli     zero, zero, e64, m4, ta, ma
> > +        vle64.v     v12, (a2)
> > +        sh3add      a2, t0, a2
> > +        vsetvli     zero, zero, e32, m2, ta, ma
> > +        vnsrl.vx    v4, v12, zero
> > +        vnsrl.vx    v6, v12, t1
> > +        vfmacc.vv   v8, v0, v4
> > +        vfnmsac.vv  v8, v2, v6
> > +        vfmacc.vv   v10, v0, v6
>
> Swap the two instructions above for better pipeline utilisation on
> in-order
> CPUs.
>
> > +        vfmacc.vv   v10, v2, v4
> > +        vsseg2e32.v v8, (a0)
> > +        sh3add      a0, t0, a0
> > +        bgtz        a3, 1b
> > +
> > +        flw         fa0, 0(a1)
> > +        flw         fa1, 0(a2)
> > +        flw         fa2, 0(a0)
> > +        fmul.s      fa0, fa0, fa1
> > +        fadd.s      fa2, fa2, fa0
>
> It won't make much difference, but you can use a fused multiply-add here.
>
> > +        fsw         fa2, 0(a0)
> > +
> > +        ret
> > +endfunc
>
> While you're at it, this looks like it could easily be adapted for the
> double
> precision version. In fact, it will be simpler, since you will have to use
> vlseg2e64 rather than vle128.v+vnsrl.vx+vnsrl.vx. But if you decide to
> implement that too, please keep it a separate patch.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-11-13 15:35         ` Rémi Denis-Courmont
  2023-11-13 16:01           ` Paul B Mahol
@ 2023-11-15  8:57           ` flow gg
  2023-11-15  8:59           ` flow gg
  2 siblings, 0 replies; 18+ messages in thread
From: flow gg @ 2023-11-15  8:57 UTC (permalink / raw)
  To: Rémi Denis-Courmont, FFmpeg development discussions and patches

Okay, I have updated these issues in the patch.

Rémi Denis-Courmont <remi@remlab.net> 于2023年11月13日周一 23:35写道:

>    Hi,
>
> Le maanantaina 13. marraskuuta 2023, 11.43.01 EET flow gg a écrit :
> > Sorry for the long delay in responding.
>
> No problem. Working with T-Head C910 (or C920?) cores is very tedious. I
> gave
> up on that and switched over to Kendryte K230 (based on C908) now.
>
> > How is the modified patch now?
>
> It looks better, but some minute improvements are still possible.
>
> > no longer using register stride(learn from your code) and have switched
> to
> > shNadd instead.
> >
> > (using m4 and m2 as they are slightly faster than m8 and m4)
> >
> > benchmark:
> > fcmul_add_c: 2179
> > fcmul_add_rvv_f32: 1652
>
> > diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
> > index 4208501393..d2d1e909c1 100644
> > --- a/libavfilter/af_afirdsp.h
> > +++ b/libavfilter/af_afirdsp.h
> > @@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
> >  } AudioFIRDSPContext;
> >
> >  void ff_afir_init_x86(AudioFIRDSPContext *s);
> > +void ff_afir_init_riscv(AudioFIRDSPContext *s);
>
> Nit: please stick to alphabetical order like most similar code.
>
> >
> >  static void fcmul_add_c(float *sum, const float *t, const float *c,
> > ptrdiff_t len)
> >  {
> > @@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext
> > *dsp)
> >
> >  #if ARCH_X86
> >      ff_afir_init_x86(dsp);
> > +#elif ARCH_RISCV
> > +    ff_afir_init_riscv(dsp);
>
> Ditto.
>
> >  #endif
> >  }
> >
> > diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
> > new file mode 100644
> > index 0000000000..0b968a9c0d
> > --- /dev/null
> > +++ b/libavfilter/riscv/Makefile
> > @@ -0,0 +1,2 @@
> > +OBJS += riscv/af_afir_init.o
> > +RVV-OBJS += riscv/af_afir_rvv.o
> > diff --git a/libavfilter/riscv/af_afir_init.c
> > b/libavfilter/riscv/af_afir_init.c new file mode 100644
> > index 0000000000..13df8341e7
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_init.c
> > @@ -0,0 +1,39 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include <stdint.h>
> > +
> > +#include "config.h"
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavfilter/af_afirdsp.h"
> > +
> > +void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
> > +                       ptrdiff_t len);
> > +
> > +av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
> > +{
> > +#if HAVE_RVV
> > +    int flags = av_get_cpu_flags();
> > +
> > +    if (flags & AV_CPU_FLAG_RVV_F32)
>
> You need to check for Zba as well here. I doubt that we'll see hardware
> with V
> and without Zba in real life, but for the sake of correctness...
>
> > +        s->fcmul_add = ff_fcmul_add_rvv;
> > +#endif
> > +}
> > diff --git a/libavfilter/riscv/af_afir_rvv.S
> > b/libavfilter/riscv/af_afir_rvv.S new file mode 100644
> > index 0000000000..078cac8e7e
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_rvv.S
> > @@ -0,0 +1,61 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +//  void ff_fcmul_add(float *sum, const float *t, const float *c, int
> len)
> > +func ff_fcmul_add_rvv, zve32f
> > +        li          t1, 32
> > +1:
> > +        vsetvli     t0, a3, e64, m4, ta, ma
>
> You can set SEW=32 and corresponding LMUL here. Then you can remove all
> other
> VSETVLI instances below. (Note that this will NOT work on draft 0.7.1
> hardware, but it does work on conformant hardware.)
>
> > +        vle64.v     v12, (a0)
>
> This requires 64-bit alignment. I don't know if this is correct for this
> specific filter, so I leave it to other people to comment here.
>
> > +        sub         a3, a3, t0
> > +        vsetvli     zero, zero, e32, m2, ta, ma
> > +        vnsrl.vx    v8, v12, zero
> > +        vnsrl.vx    v10, v12, t1
> > +        vsetvli     zero, zero, e64, m4, ta, ma
> > +        vle64.v     v12, (a1)
> > +        sh3add      a1, t0, a1
> > +        vsetvli     zero, zero, e32, m2, ta, ma
> > +        vnsrl.vx    v0, v12, zero
> > +        vnsrl.vx    v2, v12, t1
> > +        vsetvli     zero, zero, e64, m4, ta, ma
> > +        vle64.v     v12, (a2)
> > +        sh3add      a2, t0, a2
> > +        vsetvli     zero, zero, e32, m2, ta, ma
> > +        vnsrl.vx    v4, v12, zero
> > +        vnsrl.vx    v6, v12, t1
> > +        vfmacc.vv   v8, v0, v4
> > +        vfnmsac.vv  v8, v2, v6
> > +        vfmacc.vv   v10, v0, v6
>
> Swap the two instructions above for better pipeline utilisation on
> in-order
> CPUs.
>
> > +        vfmacc.vv   v10, v2, v4
> > +        vsseg2e32.v v8, (a0)
> > +        sh3add      a0, t0, a0
> > +        bgtz        a3, 1b
> > +
> > +        flw         fa0, 0(a1)
> > +        flw         fa1, 0(a2)
> > +        flw         fa2, 0(a0)
> > +        fmul.s      fa0, fa0, fa1
> > +        fadd.s      fa2, fa2, fa0
>
> It won't make much difference, but you can use a fused multiply-add here.
>
> > +        fsw         fa2, 0(a0)
> > +
> > +        ret
> > +endfunc
>
> While you're at it, this looks like it could easily be adapted for the
> double
> precision version. In fact, it will be simpler, since you will have to use
> vlseg2e64 rather than vle128.v+vnsrl.vx+vnsrl.vx. But if you decide to
> implement that too, please keep it a separate patch.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-11-13 15:35         ` Rémi Denis-Courmont
  2023-11-13 16:01           ` Paul B Mahol
  2023-11-15  8:57           ` flow gg
@ 2023-11-15  8:59           ` flow gg
  2023-11-15 15:05             ` Rémi Denis-Courmont
  2 siblings, 1 reply; 18+ messages in thread
From: flow gg @ 2023-11-15  8:59 UTC (permalink / raw)
  To: Rémi Denis-Courmont, FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 7444 bytes --]

Okay, I have updated these issues in the patch.

Rémi Denis-Courmont <remi@remlab.net> 于2023年11月13日周一 23:35写道:

>    Hi,
>
> Le maanantaina 13. marraskuuta 2023, 11.43.01 EET flow gg a écrit :
> > Sorry for the long delay in responding.
>
> No problem. Working with T-Head C910 (or C920?) cores is very tedious. I
> gave
> up on that and switched over to Kendryte K230 (based on C908) now.
>
> > How is the modified patch now?
>
> It looks better, but some minute improvements are still possible.
>
> > no longer using register stride(learn from your code) and have switched
> to
> > shNadd instead.
> >
> > (using m4 and m2 as they are slightly faster than m8 and m4)
> >
> > benchmark:
> > fcmul_add_c: 2179
> > fcmul_add_rvv_f32: 1652
>
> > diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
> > index 4208501393..d2d1e909c1 100644
> > --- a/libavfilter/af_afirdsp.h
> > +++ b/libavfilter/af_afirdsp.h
> > @@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
> >  } AudioFIRDSPContext;
> >
> >  void ff_afir_init_x86(AudioFIRDSPContext *s);
> > +void ff_afir_init_riscv(AudioFIRDSPContext *s);
>
> Nit: please stick to alphabetical order like most similar code.
>
> >
> >  static void fcmul_add_c(float *sum, const float *t, const float *c,
> > ptrdiff_t len)
> >  {
> > @@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext
> > *dsp)
> >
> >  #if ARCH_X86
> >      ff_afir_init_x86(dsp);
> > +#elif ARCH_RISCV
> > +    ff_afir_init_riscv(dsp);
>
> Ditto.
>
> >  #endif
> >  }
> >
> > diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
> > new file mode 100644
> > index 0000000000..0b968a9c0d
> > --- /dev/null
> > +++ b/libavfilter/riscv/Makefile
> > @@ -0,0 +1,2 @@
> > +OBJS += riscv/af_afir_init.o
> > +RVV-OBJS += riscv/af_afir_rvv.o
> > diff --git a/libavfilter/riscv/af_afir_init.c
> > b/libavfilter/riscv/af_afir_init.c new file mode 100644
> > index 0000000000..13df8341e7
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_init.c
> > @@ -0,0 +1,39 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include <stdint.h>
> > +
> > +#include "config.h"
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavfilter/af_afirdsp.h"
> > +
> > +void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
> > +                       ptrdiff_t len);
> > +
> > +av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
> > +{
> > +#if HAVE_RVV
> > +    int flags = av_get_cpu_flags();
> > +
> > +    if (flags & AV_CPU_FLAG_RVV_F32)
>
> You need to check for Zba as well here. I doubt that we'll see hardware
> with V
> and without Zba in real life, but for the sake of correctness...
>
> > +        s->fcmul_add = ff_fcmul_add_rvv;
> > +#endif
> > +}
> > diff --git a/libavfilter/riscv/af_afir_rvv.S
> > b/libavfilter/riscv/af_afir_rvv.S new file mode 100644
> > index 0000000000..078cac8e7e
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_rvv.S
> > @@ -0,0 +1,61 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +//  void ff_fcmul_add(float *sum, const float *t, const float *c, int
> len)
> > +func ff_fcmul_add_rvv, zve32f
> > +        li          t1, 32
> > +1:
> > +        vsetvli     t0, a3, e64, m4, ta, ma
>
> You can set SEW=32 and corresponding LMUL here. Then you can remove all
> other
> VSETVLI instances below. (Note that this will NOT work on draft 0.7.1
> hardware, but it does work on conformant hardware.)
>
> > +        vle64.v     v12, (a0)
>
> This requires 64-bit alignment. I don't know if this is correct for this
> specific filter, so I leave it to other people to comment here.
>
> > +        sub         a3, a3, t0
> > +        vsetvli     zero, zero, e32, m2, ta, ma
> > +        vnsrl.vx    v8, v12, zero
> > +        vnsrl.vx    v10, v12, t1
> > +        vsetvli     zero, zero, e64, m4, ta, ma
> > +        vle64.v     v12, (a1)
> > +        sh3add      a1, t0, a1
> > +        vsetvli     zero, zero, e32, m2, ta, ma
> > +        vnsrl.vx    v0, v12, zero
> > +        vnsrl.vx    v2, v12, t1
> > +        vsetvli     zero, zero, e64, m4, ta, ma
> > +        vle64.v     v12, (a2)
> > +        sh3add      a2, t0, a2
> > +        vsetvli     zero, zero, e32, m2, ta, ma
> > +        vnsrl.vx    v4, v12, zero
> > +        vnsrl.vx    v6, v12, t1
> > +        vfmacc.vv   v8, v0, v4
> > +        vfnmsac.vv  v8, v2, v6
> > +        vfmacc.vv   v10, v0, v6
>
> Swap the two instructions above for better pipeline utilisation on
> in-order
> CPUs.
>
> > +        vfmacc.vv   v10, v2, v4
> > +        vsseg2e32.v v8, (a0)
> > +        sh3add      a0, t0, a0
> > +        bgtz        a3, 1b
> > +
> > +        flw         fa0, 0(a1)
> > +        flw         fa1, 0(a2)
> > +        flw         fa2, 0(a0)
> > +        fmul.s      fa0, fa0, fa1
> > +        fadd.s      fa2, fa2, fa0
>
> It won't make much difference, but you can use a fused multiply-add here.
>
> > +        fsw         fa2, 0(a0)
> > +
> > +        ret
> > +endfunc
>
> While you're at it, this looks like it could easily be adapted for the
> double
> precision version. In fact, it will be simpler, since you will have to use
> vlseg2e64 rather than vle128.v+vnsrl.vx+vnsrl.vx. But if you decide to
> implement that too, please keep it a separate patch.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
>

[-- Attachment #2: af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 5356 bytes --]

From 66e681b6d85a9ffead13f5bc21975f7110f6c84f Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Fri, 3 Nov 2023 10:35:53 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add

benchmark:
fcmul_add_c: 2179
fcmul_add_rvv_f32: 1652
---
 libavfilter/af_afirdsp.h         |  5 ++-
 libavfilter/riscv/Makefile       |  2 ++
 libavfilter/riscv/af_afir_init.c | 42 ++++++++++++++++++++++++
 libavfilter/riscv/af_afir_rvv.S  | 55 ++++++++++++++++++++++++++++++++
 4 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/riscv/Makefile
 create mode 100644 libavfilter/riscv/af_afir_init.c
 create mode 100644 libavfilter/riscv/af_afir_rvv.S

diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..827e067a9b 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -33,6 +33,7 @@ typedef struct AudioFIRDSPContext {
                       ptrdiff_t len);
 } AudioFIRDSPContext;
 
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
 void ff_afir_init_x86(AudioFIRDSPContext *s);
 
 static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
@@ -74,7 +75,9 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
     dsp->fcmul_add = fcmul_add_c;
     dsp->dcmul_add = dcmul_add_c;
 
-#if ARCH_X86
+#if ARCH_RISCV
+    ff_afir_init_riscv(dsp);
+#elif ARCH_X86
     ff_afir_init_x86(dsp);
 #endif
 }
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..c05cc26c04
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+                       ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+    int flags = av_get_cpu_flags();
+
+    if (flags & AV_CPU_FLAG_RVV_F32) {
+        if (flags & AV_CPU_FLAG_RVB_ADDR) {
+            s->fcmul_add = ff_fcmul_add_rvv;
+        }
+    }
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..ff4146133f
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+//  void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
+func ff_fcmul_add_rvv, zve32f
+        li          t1, 32
+1:
+        vsetvli     t0, a3, e32, m4, ta, ma
+        vle64.v     v24, (a0)
+        sub         a3, a3, t0
+        vnsrl.wx    v16, v24, zero
+        vnsrl.wx    v20, v24, t1
+        vle64.v     v24, (a1)
+        sh3add      a1, t0, a1
+        vnsrl.wx    v0, v24, zero
+        vnsrl.wx    v4, v24, t1
+        vle64.v     v24, (a2)
+        sh3add      a2, t0, a2
+        vnsrl.wx    v8, v24, zero
+        vnsrl.wx    v12, v24, t1
+        vfmacc.vv   v16, v0, v8
+        vfmacc.vv   v20, v4, v8
+        vfnmsac.vv  v16, v4, v12
+        vfmacc.vv   v20, v0, v12
+        vsseg2e32.v v16, (a0)
+        sh3add      a0, t0, a0
+        bgtz        a3, 1b
+
+        flw         fa0, 0(a1)
+        flw         fa1, 0(a2)
+        flw         fa2, 0(a0)
+        fmadd.s     fa2, fa0, fa1, fa2
+        fsw         fa2, 0(a0)
+
+        ret
+endfunc
-- 
2.42.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-11-15  8:59           ` flow gg
@ 2023-11-15 15:05             ` Rémi Denis-Courmont
  2023-11-15 23:04               ` flow gg
  0 siblings, 1 reply; 18+ messages in thread
From: Rémi Denis-Courmont @ 2023-11-15 15:05 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le keskiviikkona 15. marraskuuta 2023, 10.59.55 EET flow gg a écrit :
> Okay, I have updated these issues in the patch.

It does not assemble but I can fix it locally. The narrowing shift trickery 
require Zve64x, or rather Zve64f in this case.

The performance improvement is much better on newer hardware:
fcmul_add_c: 4891.2
fcmul_add_rvv_f64: 2399.5

FWIW, VLSEG2E32.V remains slightly worse than with shifting:
fcmul_add_c: 4891.2
fcmul_add_rvv_f32: 2877.5

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add
  2023-11-15 15:05             ` Rémi Denis-Courmont
@ 2023-11-15 23:04               ` flow gg
  0 siblings, 0 replies; 18+ messages in thread
From: flow gg @ 2023-11-15 23:04 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 997 bytes --]

Okay, I have modified them to 64 and added some descriptions.

Rémi Denis-Courmont <remi@remlab.net> 于2023年11月15日周三 23:06写道:

> Le keskiviikkona 15. marraskuuta 2023, 10.59.55 EET flow gg a écrit :
> > Okay, I have updated these issues in the patch.
>
> It does not assemble but I can fix it locally. The narrowing shift
> trickery
> require Zve64x, or rather Zve64f in this case.
>
> The performance improvement is much better on newer hardware:
> fcmul_add_c: 4891.2
> fcmul_add_rvv_f64: 2399.5
>
> FWIW, VLSEG2E32.V remains slightly worse than with shifting:
> fcmul_add_c: 4891.2
> fcmul_add_rvv_f32: 2877.5
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>

[-- Attachment #2: af_afir-RISC-V-V-fcmul_add.patch --]
[-- Type: text/x-patch, Size: 5486 bytes --]

From 6b88fbf9b94c098841197c9fcb467006177ee4c6 Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Fri, 3 Nov 2023 10:35:53 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add

Segmented loads are slow, so here we use unit-strided load and narrowing shifts.

c910:
fcmul_add_c: 2179
fcmul_add_rvv_f64: 1652

c908:
fcmul_add_c: 4891.2
fcmul_add_rvv_f64: 2399.5
---
 libavfilter/af_afirdsp.h         |  5 ++-
 libavfilter/riscv/Makefile       |  2 ++
 libavfilter/riscv/af_afir_init.c | 42 ++++++++++++++++++++++++
 libavfilter/riscv/af_afir_rvv.S  | 55 ++++++++++++++++++++++++++++++++
 4 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/riscv/Makefile
 create mode 100644 libavfilter/riscv/af_afir_init.c
 create mode 100644 libavfilter/riscv/af_afir_rvv.S

diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..827e067a9b 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -33,6 +33,7 @@ typedef struct AudioFIRDSPContext {
                       ptrdiff_t len);
 } AudioFIRDSPContext;
 
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
 void ff_afir_init_x86(AudioFIRDSPContext *s);
 
 static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
@@ -74,7 +75,9 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
     dsp->fcmul_add = fcmul_add_c;
     dsp->dcmul_add = dcmul_add_c;
 
-#if ARCH_X86
+#if ARCH_RISCV
+    ff_afir_init_riscv(dsp);
+#elif ARCH_X86
     ff_afir_init_x86(dsp);
 #endif
 }
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 0000000000..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 0000000000..52aa18c126
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+                       ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+    int flags = av_get_cpu_flags();
+
+    if (flags & AV_CPU_FLAG_RVV_F64) {
+        if (flags & AV_CPU_FLAG_RVB_ADDR) {
+            s->fcmul_add = ff_fcmul_add_rvv;
+        }
+    }
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 0000000000..04ec2e50d8
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+//  void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
+func ff_fcmul_add_rvv, zve64f
+        li          t1, 32
+1:
+        vsetvli     t0, a3, e32, m4, ta, ma
+        vle64.v     v24, (a0)
+        sub         a3, a3, t0
+        vnsrl.wx    v16, v24, zero
+        vnsrl.wx    v20, v24, t1
+        vle64.v     v24, (a1)
+        sh3add      a1, t0, a1
+        vnsrl.wx    v0, v24, zero
+        vnsrl.wx    v4, v24, t1
+        vle64.v     v24, (a2)
+        sh3add      a2, t0, a2
+        vnsrl.wx    v8, v24, zero
+        vnsrl.wx    v12, v24, t1
+        vfmacc.vv   v16, v0, v8
+        vfmacc.vv   v20, v4, v8
+        vfnmsac.vv  v16, v4, v12
+        vfmacc.vv   v20, v0, v12
+        vsseg2e32.v v16, (a0)
+        sh3add      a0, t0, a0
+        bgtz        a3, 1b
+
+        flw         fa0, 0(a1)
+        flw         fa1, 0(a2)
+        flw         fa2, 0(a0)
+        fmadd.s     fa2, fa0, fa1, fa2
+        fsw         fa2, 0(a0)
+
+        ret
+endfunc
-- 
2.42.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2023-11-15 23:05 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-26  9:24 [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add flow gg
2023-09-26 18:34 ` Rémi Denis-Courmont
2023-09-26 18:40   ` Paul B Mahol
2023-09-26 18:44     ` Rémi Denis-Courmont
2023-09-27  1:47       ` flow gg
2023-09-27 16:01         ` Rémi Denis-Courmont
2023-09-27 16:27         ` Rémi Denis-Courmont
2023-09-26 18:50 ` Rémi Denis-Courmont
2023-09-27 16:41 ` Rémi Denis-Courmont
2023-09-28  5:45   ` flow gg
2023-09-28 13:33     ` Rémi Denis-Courmont
2023-11-13  9:43       ` flow gg
2023-11-13 15:35         ` Rémi Denis-Courmont
2023-11-13 16:01           ` Paul B Mahol
2023-11-15  8:57           ` flow gg
2023-11-15  8:59           ` flow gg
2023-11-15 15:05             ` Rémi Denis-Courmont
2023-11-15 23:04               ` flow gg

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git