Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/7] lavc/me_cmp: R-V V pix_abs
@ 2024-02-06 15:55 flow gg
  2024-02-06 16:58 ` Rémi Denis-Courmont
  0 siblings, 1 reply; 9+ messages in thread
From: flow gg @ 2024-02-06 15:55 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0001-lavc-me_cmp-R-V-V-pix_abs.patch --]
[-- Type: text/x-patch, Size: 6857 bytes --]

From d4d6b3ea040f3f7997463b4452813bc75d1c9f9d Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Sat, 3 Feb 2024 10:58:13 +0800
Subject: [PATCH 1/7] lavc/me_cmp: R-V V pix_abs

C908:
pix_abs_0_0_c: 534.0
pix_abs_0_0_rvv_i32: 136.2
pix_abs_1_0_c: 287.7
pix_abs_1_0_rvv_i32: 125.2
sad_0_c: 534.0
sad_0_rvv_i32: 136.2
sad_1_c: 287.7
sad_1_rvv_i32: 125.2
---
 libavcodec/me_cmp.c            |  2 +
 libavcodec/me_cmp.h            |  1 +
 libavcodec/riscv/Makefile      |  2 +
 libavcodec/riscv/me_cmp_init.c | 46 +++++++++++++++++++++++
 libavcodec/riscv/me_cmp_rvv.S  | 67 ++++++++++++++++++++++++++++++++++
 5 files changed, 118 insertions(+)
 create mode 100644 libavcodec/riscv/me_cmp_init.c
 create mode 100644 libavcodec/riscv/me_cmp_rvv.S

diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index fecd70d723..8f4b3d0ad5 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1136,6 +1136,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
     ff_me_cmp_init_arm(c, avctx);
 #elif ARCH_PPC
     ff_me_cmp_init_ppc(c, avctx);
+#elif ARCH_RISCV
+    ff_me_cmp_init_riscv(c, avctx);
 #elif ARCH_X86
     ff_me_cmp_init_x86(c, avctx);
 #elif ARCH_MIPS
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index aefd32a7dc..fee0ecb28e 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -86,6 +86,7 @@ void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_riscv(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
 
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 97067558d8..dff8784102 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -41,6 +41,8 @@ OBJS-$(CONFIG_LLVIDENCDSP) += riscv/llvidencdsp_init.o
 RVV-OBJS-$(CONFIG_LLVIDENCDSP) += riscv/llvidencdsp_rvv.o
 OBJS-$(CONFIG_LPC) += riscv/lpc_init.o
 RVV-OBJS-$(CONFIG_LPC) += riscv/lpc_rvv.o
+OBJS-$(CONFIG_ME_CMP) += riscv/me_cmp_init.o
+RVV-OBJS-$(CONFIG_ME_CMP) += riscv/me_cmp_rvv.o
 OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_init.o
 RVV-OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_rvv.o
 OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o
diff --git a/libavcodec/riscv/me_cmp_init.c b/libavcodec/riscv/me_cmp_init.c
new file mode 100644
index 0000000000..9228f74cfd
--- /dev/null
+++ b/libavcodec/riscv/me_cmp_init.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/me_cmp.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                              ptrdiff_t stride, int h);
+int ff_pix_abs8_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                             ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_riscv(MECmpContext *c, AVCodecContext *avctx)
+{ /* Installs the RVV SAD kernels into c when the running CPU supports them; avctx is not used here. */
+#if HAVE_RVV
+    int flags = av_get_cpu_flags(); /* CPU feature flags detected at run time */
+
+    if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { /* kernels use vl=16 at e8/m1; presumably vlenb is the vector register size in bytes - confirm */
+        c->pix_abs[0][0] = ff_pix_abs16_rvv; /* 16-byte-wide rows */
+        c->sad[0] = ff_pix_abs16_rvv; /* same kernel also serves as sad[0] */
+        c->pix_abs[1][0] = ff_pix_abs8_rvv; /* 8-byte-wide rows */
+        c->sad[1] = ff_pix_abs8_rvv; /* same kernel also serves as sad[1] */
+    }
+#endif
+}
diff --git a/libavcodec/riscv/me_cmp_rvv.S b/libavcodec/riscv/me_cmp_rvv.S
new file mode 100644
index 0000000000..8dadf39bc7
--- /dev/null
+++ b/libavcodec/riscv/me_cmp_rvv.S
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro pix_abs_ret // common epilogue: return element 0 of v0 in a0
+        vsetivli        zero, 1, e32, m1, ta, ma // vl = 1, 32-bit elements
+        vmv.x.s         a0, v0 // a0 = v0[0], the accumulated sum
+        ret
+.endm
+
+func ff_pix_abs16_rvv, zve32x // sum of |pix1 - pix2| over h rows of 16 bytes; a1 = pix1, a2 = pix2, a3 = stride, a4 = h
+        vsetivli        zero, 1, e32, m1, ta, ma // vl = 1, 32-bit elements
+        vmv.s.x         v0, zero // v0[0] = 0: running 32-bit sum
+1: // one iteration per row; body runs before h is tested, so h >= 1 is assumed on entry
+        vsetivli        zero, 16, e8, m1, tu, ma // 16 byte elements per row
+        vle8.v          v4, (a1) // v4 = current row of pix1
+        vle8.v          v12, (a2) // v12 = current row of pix2
+        addi            a4, a4, -1 // one row less to go
+        vwsubu.vv       v16, v4, v12 // v16 = pix1 - pix2, widened to 16 bits
+        add             a1, a1, a3 // pix1 += stride
+        vwsubu.vv       v20, v12, v4 // v20 = pix2 - pix1, widened to 16 bits
+        vsetvli         zero, zero, e16, m2, tu, ma // keep vl, switch to the widened 16-bit elements
+        vmax.vv         v16, v16, v20 // signed max of the two differences = |pix1 - pix2|
+        add             a2, a2, a3 // pix2 += stride
+        vwredsum.vs     v0, v16, v0 // v0[0] += sum of the 16-bit absolute differences (32-bit result)
+        bnez            a4, 1b // loop until all rows are consumed
+
+        pix_abs_ret
+endfunc
+
+func ff_pix_abs8_rvv, zve32x // sum of |pix1 - pix2| over h rows of 8 bytes; a1 = pix1, a2 = pix2, a3 = stride, a4 = h
+        vsetivli        zero, 1, e32, m1, ta, ma // vl = 1, 32-bit elements
+        vmv.s.x         v0, zero // v0[0] = 0: running 32-bit sum
+1: // one iteration per row; body runs before h is tested, so h >= 1 is assumed on entry
+        vsetivli        zero, 8, e8, mf2, tu, ma // 8 byte elements per row, half a register group
+        vle8.v          v4, (a1) // v4 = current row of pix1
+        vle8.v          v12, (a2) // v12 = current row of pix2
+        addi            a4, a4, -1 // one row less to go
+        vwsubu.vv       v16, v4, v12 // v16 = pix1 - pix2, widened to 16 bits
+        add             a1, a1, a3 // pix1 += stride
+        vwsubu.vv       v20, v12, v4 // v20 = pix2 - pix1, widened to 16 bits
+        vsetvli         zero, zero, e16, m1, tu, ma // keep vl, switch to the widened 16-bit elements
+        vmax.vv         v16, v16, v20 // signed max of the two differences = |pix1 - pix2|
+        add             a2, a2, a3 // pix2 += stride
+        vwredsum.vs     v0, v16, v0 // v0[0] += sum of the 16-bit absolute differences (32-bit result)
+        bnez            a4, 1b // loop until all rows are consumed
+
+        pix_abs_ret
+endfunc
-- 
2.43.0


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] lavc/me_cmp: R-V V pix_abs
  2024-02-06 15:55 [FFmpeg-devel] [PATCH 1/7] lavc/me_cmp: R-V V pix_abs flow gg
@ 2024-02-06 16:58 ` Rémi Denis-Courmont
  2024-02-07  0:01   ` flow gg
  0 siblings, 1 reply; 9+ messages in thread
From: Rémi Denis-Courmont @ 2024-02-06 16:58 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: flow gg

Hi,

To sum a vector, you should only reduce once at the end of the function, c.f. 
how it's done in existing scalar products. Reduction instructions are 
(intrinsically) slow.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] lavc/me_cmp: R-V V pix_abs
  2024-02-06 16:58 ` Rémi Denis-Courmont
@ 2024-02-07  0:01   ` flow gg
  2024-02-08 19:41     ` Rémi Denis-Courmont
  0 siblings, 1 reply; 9+ messages in thread
From: flow gg @ 2024-02-07  0:01 UTC (permalink / raw)
  To: Rémi Denis-Courmont, FFmpeg development discussions and patches

I think in most cases it is like this, but specifically for this function,
using Reduction only once would be slower.

The currently submitted version roughly takes:
pix_abs_0_0_rvv_i32: 136.2

The version that uses Reduction only once takes:
pix_abs_0_0_rvv_i32: 169.2

Here is the implementation of the version that uses it only once:

func ff_pix_abs16_temp_rvv, zve32x // experimental variant: accumulate per-lane sums, reduce once after the loop
        vsetivli        zero, 16, e32, m4, ta, ma // 16 lanes of 32-bit elements
        vmv.v.i         v24, 0 // v24 = per-lane 32-bit accumulators, all zero
        vmv.s.x         v0, zero // v0[0] = 0: seed for the final reduction
1: // one iteration per row; body runs before a4 is tested
        vsetvli         zero, zero, e8, m1, tu, ma // keep vl = 16, byte elements
        vle8.v          v4, (a1) // v4 = current row of the first block
        vle8.v          v12, (a2) // v12 = current row of the second block
        addi            a4, a4, -1 // one row less to go
        vwsubu.vv       v16, v4, v12 // v16 = first - second, widened to 16 bits
        add             a1, a1, a3 // advance first pointer by the stride
        vwsubu.vv       v20, v12, v4 // v20 = second - first, widened to 16 bits
        vsetvli         zero, zero, e16, m2, tu, ma // keep vl, switch to the widened 16-bit elements
        vmax.vv         v16, v16, v20 // signed max of the two differences = absolute difference
        add             a2, a2, a3 // advance second pointer by the stride
        vwadd.wv        v24, v24, v16 // 32-bit accumulators += 16-bit absolute differences
        bnez            a4, 1b // loop until all rows are consumed

        vsetvli         zero, zero, e32, m4, ta, ma // back to the 32-bit accumulator layout
        vwredsumu.vs    v0, v24, v0 // NOTE(review): widening reduction at SEW=32 yields a 64-bit sum, which looks unsupported under zve32x; vredsum.vs may be intended - confirm
        vmv.x.s         a0, v0 // a0 = v0[0], the total
        ret
endfunc

Rémi Denis-Courmont <remi@remlab.net> 于2024年2月7日周三 00:58写道:

> Hi,
>
> To sum a vector, you should only reduce once at the end of the function,
> c.f.
> how it's done in existing scalar products. Reduction instructions are
> (intrinsically) slow.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] lavc/me_cmp: R-V V pix_abs
  2024-02-07  0:01   ` flow gg
@ 2024-02-08 19:41     ` Rémi Denis-Courmont
  2024-02-08 22:39       ` flow gg
  0 siblings, 1 reply; 9+ messages in thread
From: Rémi Denis-Courmont @ 2024-02-08 19:41 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le keskiviikkona 7. helmikuuta 2024, 2.01.23 EET flow gg a écrit :
> I think in most cases it is like this, but specifically for this function,
> using Reduction only once would be slower.
> 
> The currently submitted version roughly takes:
> pix_abs_0_0_rvv_i32: 136.2
> 
> The version that uses Reduction only once takes:
> pix_abs_0_0_rvv_i32: 169.2

You're only using one vector and half a vector respectively, so the 
logarithmic time of the sum is relatively small.

But are you sure that it wouldn't be faster to process multiple rows and 
larger group multipliers?

> Here is the implementation of the version that uses it only once:
> 
> func ff_pix_abs16_temp_rvv, zve32x
>         vsetivli        zero, 16, e32, m4, ta, ma
>         vmv.v.i         v24, 0
>         vmv.s.x         v0, zero
> 1:
>         vsetvli         zero, zero, e8, m1, tu, ma
>         vle8.v          v4, (a1)
>         vle8.v          v12, (a2)
>         addi            a4, a4, -1
>         vwsubu.vv       v16, v4, v12
>         add             a1, a1, a3
>         vwsubu.vv       v20, v12, v4
>         vsetvli         zero, zero, e16, m2, tu, ma
>         vmax.vv         v16, v16, v20
>         add             a2, a2, a3
>         vwadd.wv        v24, v24, v16
>         bnez            a4, 1b
> 
>         vsetvli         zero, zero, e32, m4, ta, ma
>         vwredsumu.vs    v0, v24, v0
>         vmv.x.s         a0, v0
>         ret
> endfunc
> 
> Rémi Denis-Courmont <remi@remlab.net> 于2024年2月7日周三 00:58写道:
> 
> > Hi,
> > 
> > To sum a vector, you should only reduce once at the end of the function,
> > c.f.
> > how it's done in existing scalar products. Reduction instructions are
> > (intrinsically) slow.
> > 
> > --
> > Rémi Denis-Courmont
> > http://www.remlab.net/
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".


-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] lavc/me_cmp: R-V V pix_abs
  2024-02-08 19:41     ` Rémi Denis-Courmont
@ 2024-02-08 22:39       ` flow gg
  2024-02-09 10:32         ` Rémi Denis-Courmont
  0 siblings, 1 reply; 9+ messages in thread
From: flow gg @ 2024-02-08 22:39 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

From my understanding, to use larger group multipliers, one needs to
utilize vlse64 (8x8) vlse128 (16x16).

However, due to the use in tests of

ptr = img2 + y * WIDTH + x;
d2 = call_ref(NULL, img1, ptr, WIDTH, h);
d1 = call_new(NULL, img1, ptr, WIDTH, h);

will get:  pix_abs_1_0_rvv_i32 (fatal signal 7: Bus error)

Because it can only load according to e8, it seems there's no way to use
larger group multipliers.



Rémi Denis-Courmont <remi@remlab.net> 于2024年2月9日周五 03:41写道:

> Le keskiviikkona 7. helmikuuta 2024, 2.01.23 EET flow gg a écrit :
> > I think in most cases it is like this, but specifically for this
> function,
> > using Reduction only once would be slower.
> >
> > The currently submitted version roughly takes:
> > pix_abs_0_0_rvv_i32: 136.2
> >
> > The version that uses Reduction only once takes:
> > pix_abs_0_0_rvv_i32: 169.2
>
> You're only using one vector and half a vector respectively, so the
> logarithmic time of the sum is relatively small.
>
> But are you sure that it wouldn't be faster to process multiple rows and
> larger group multiplers?
>
> > Here is the implementation of the version that uses it only once:
> >
> > func ff_pix_abs16_temp_rvv, zve32x
> >         vsetivli        zero, 16, e32, m4, ta, ma
> >         vmv.v.i         v24, 0
> >         vmv.s.x         v0, zero
> > 1:
> >         vsetvli         zero, zero, e8, m1, tu, ma
> >         vle8.v          v4, (a1)
> >         vle8.v          v12, (a2)
> >         addi            a4, a4, -1
> >         vwsubu.vv       v16, v4, v12
> >         add             a1, a1, a3
> >         vwsubu.vv       v20, v12, v4
> >         vsetvli         zero, zero, e16, m2, tu, ma
> >         vmax.vv         v16, v16, v20
> >         add             a2, a2, a3
> >         vwadd.wv        v24, v24, v16
> >         bnez            a4, 1b
> >
> >         vsetvli         zero, zero, e32, m4, ta, ma
> >         vwredsumu.vs    v0, v24, v0
> >         vmv.x.s         a0, v0
> >         ret
> > endfunc
> >
> > Rémi Denis-Courmont <remi@remlab.net> 于2024年2月7日周三 00:58写道:
> >
> > > Hi,
> > >
> > > To sum a vector, you should only reduce once at the end of the
> function,
> > > c.f.
> > > how it's done in existing scalar products. Reduction instructions are
> > > (intrinsically) slow.
> > >
> > > --
> > > Rémi Denis-Courmont
> > > http://www.remlab.net/
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] lavc/me_cmp: R-V V pix_abs
  2024-02-08 22:39       ` flow gg
@ 2024-02-09 10:32         ` Rémi Denis-Courmont
  2024-02-09 15:34           ` flow gg
  0 siblings, 1 reply; 9+ messages in thread
From: Rémi Denis-Courmont @ 2024-02-09 10:32 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



Le 9 février 2024 00:39:38 GMT+02:00, flow gg <hlefthleft@gmail.com> a écrit :
>From my understanding, to use larger group multipliers, one needs to
>utilize vlse64 (8x8) vlse128 (16x16).
>
>However, due to the use in tests of
>
>ptr = img2 + y * WIDTH + x;
>d2 = call_ref(NULL, img1, ptr, WIDTH, h);
>d1 = call_new(NULL, img1, ptr, WIDTH, h);
>
>will get:  pix_abs_1_0_rvv_i32 (fatal signal 7: Bus error)
>
>Because it can only load according to e8, it seems there's no way to use
>larger group multipliers.

vlse128.v requires 128-bit elements, which no hardware supports. vlse64.v works just fine; we're already using it. There's also the possibility of segmented strided loads, or simply multiple unit loads.

In any case, unrolling one way or other should improve performance.


>
>
>
>Rémi Denis-Courmont <remi@remlab.net> 于2024年2月9日周五 03:41写道:
>
>> Le keskiviikkona 7. helmikuuta 2024, 2.01.23 EET flow gg a écrit :
>> > I think in most cases it is like this, but specifically for this
>> function,
>> > using Reduction only once would be slower.
>> >
>> > The currently submitted version roughly takes:
>> > pix_abs_0_0_rvv_i32: 136.2
>> >
>> > The version that uses Reduction only once takes:
>> > pix_abs_0_0_rvv_i32: 169.2
>>
>> You're only using one vector and half a vector respectively, so the
>> logarithmic time of the sum is relatively small.
>>
>> But are you sure that it wouldn't be faster to process multiple rows and
>> larger group multiplers?
>>
>> > Here is the implementation of the version that uses it only once:
>> >
>> > func ff_pix_abs16_temp_rvv, zve32x
>> >         vsetivli        zero, 16, e32, m4, ta, ma
>> >         vmv.v.i         v24, 0
>> >         vmv.s.x         v0, zero
>> > 1:
>> >         vsetvli         zero, zero, e8, m1, tu, ma
>> >         vle8.v          v4, (a1)
>> >         vle8.v          v12, (a2)
>> >         addi            a4, a4, -1
>> >         vwsubu.vv       v16, v4, v12
>> >         add             a1, a1, a3
>> >         vwsubu.vv       v20, v12, v4
>> >         vsetvli         zero, zero, e16, m2, tu, ma
>> >         vmax.vv         v16, v16, v20
>> >         add             a2, a2, a3
>> >         vwadd.wv        v24, v24, v16
>> >         bnez            a4, 1b
>> >
>> >         vsetvli         zero, zero, e32, m4, ta, ma
>> >         vwredsumu.vs    v0, v24, v0
>> >         vmv.x.s         a0, v0
>> >         ret
>> > endfunc
>> >
>> > Rémi Denis-Courmont <remi@remlab.net> 于2024年2月7日周三 00:58写道:
>> >
>> > > Hi,
>> > >
>> > > To sum a vector, you should only reduce once at the end of the
>> function,
>> > > c.f.
>> > > how it's done in existing scalar products. Reduction instructions are
>> > > (intrinsically) slow.
>> > >
>> > > --
>> > > Rémi Denis-Courmont
>> > > http://www.remlab.net/
>> >
>> > _______________________________________________
>> > ffmpeg-devel mailing list
>> > ffmpeg-devel@ffmpeg.org
>> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>> >
>> > To unsubscribe, visit link above, or email
>> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
>>
>> --
>> 雷米‧德尼-库尔蒙
>> http://www.remlab.net/
>>
>>
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
>_______________________________________________
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] lavc/me_cmp: R-V V pix_abs
  2024-02-09 10:32         ` Rémi Denis-Courmont
@ 2024-02-09 15:34           ` flow gg
  2024-02-10  9:14             ` Rémi Denis-Courmont
  0 siblings, 1 reply; 9+ messages in thread
From: flow gg @ 2024-02-09 15:34 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

The issue here is that any load greater than e8 will fail the test (Bus
error), so it cannot use vlse64 or similar methods...

Rémi Denis-Courmont <remi@remlab.net> 于2024年2月9日周五 18:32写道:

>
>
> Le 9 février 2024 00:39:38 GMT+02:00, flow gg <hlefthleft@gmail.com> a
> écrit :
> >From my understanding, to use larger group multipliers, one needs to
> >utilize vlse64 (8x8) vlse128 (16x16).
> >
> >However, due to the use in tests of
> >
> >ptr = img2 + y * WIDTH + x;
> >d2 = call_ref(NULL, img1, ptr, WIDTH, h);
> >d1 = call_new(NULL, img1, ptr, WIDTH, h);
> >
> >will get:  pix_abs_1_0_rvv_i32 (fatal signal 7: Bus error)
> >
> >Because it can only load according to e8, it seems there's no way to use
> >larger group multipliers.
>
> vlse128.v requires 128-bit elements, which no hardware supports. vlse64.v
> works just fine; we're already using it. There's also the possibility of
> segmented strided loads, or simply multiple unit loads.
>
> In any case, unrolling one way or other should improve performance.
>
>
> >
> >
> >
> >Rémi Denis-Courmont <remi@remlab.net> 于2024年2月9日周五 03:41写道:
> >
> >> Le keskiviikkona 7. helmikuuta 2024, 2.01.23 EET flow gg a écrit :
> >> > I think in most cases it is like this, but specifically for this
> >> function,
> >> > using Reduction only once would be slower.
> >> >
> >> > The currently submitted version roughly takes:
> >> > pix_abs_0_0_rvv_i32: 136.2
> >> >
> >> > The version that uses Reduction only once takes:
> >> > pix_abs_0_0_rvv_i32: 169.2
> >>
> >> You're only using one vector and half a vector respectively, so the
> >> logarithmic time of the sum is relatively small.
> >>
> >> But are you sure that it wouldn't be faster to process multiple rows and
> >> larger group multiplers?
> >>
> >> > Here is the implementation of the version that uses it only once:
> >> >
> >> > func ff_pix_abs16_temp_rvv, zve32x
> >> >         vsetivli        zero, 16, e32, m4, ta, ma
> >> >         vmv.v.i         v24, 0
> >> >         vmv.s.x         v0, zero
> >> > 1:
> >> >         vsetvli         zero, zero, e8, m1, tu, ma
> >> >         vle8.v          v4, (a1)
> >> >         vle8.v          v12, (a2)
> >> >         addi            a4, a4, -1
> >> >         vwsubu.vv       v16, v4, v12
> >> >         add             a1, a1, a3
> >> >         vwsubu.vv       v20, v12, v4
> >> >         vsetvli         zero, zero, e16, m2, tu, ma
> >> >         vmax.vv         v16, v16, v20
> >> >         add             a2, a2, a3
> >> >         vwadd.wv        v24, v24, v16
> >> >         bnez            a4, 1b
> >> >
> >> >         vsetvli         zero, zero, e32, m4, ta, ma
> >> >         vwredsumu.vs    v0, v24, v0
> >> >         vmv.x.s         a0, v0
> >> >         ret
> >> > endfunc
> >> >
> >> > Rémi Denis-Courmont <remi@remlab.net> 于2024年2月7日周三 00:58写道:
> >> >
> >> > > Hi,
> >> > >
> >> > > To sum a vector, you should only reduce once at the end of the
> >> function,
> >> > > c.f.
> >> > > how it's done in existing scalar products. Reduction instructions
> are
> >> > > (intrinsically) slow.
> >> > >
> >> > > --
> >> > > Rémi Denis-Courmont
> >> > > http://www.remlab.net/
> >> >
> >> > _______________________________________________
> >> > ffmpeg-devel mailing list
> >> > ffmpeg-devel@ffmpeg.org
> >> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >> >
> >> > To unsubscribe, visit link above, or email
> >> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >>
> >>
> >> --
> >> 雷米‧德尼-库尔蒙
> >> http://www.remlab.net/
> >>
> >>
> >>
> >> _______________________________________________
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel@ffmpeg.org
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >>
> >_______________________________________________
> >ffmpeg-devel mailing list
> >ffmpeg-devel@ffmpeg.org
> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> >To unsubscribe, visit link above, or email
> >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] lavc/me_cmp: R-V V pix_abs
  2024-02-09 15:34           ` flow gg
@ 2024-02-10  9:14             ` Rémi Denis-Courmont
  2024-02-10  9:15               ` Rémi Denis-Courmont
  0 siblings, 1 reply; 9+ messages in thread
From: Rémi Denis-Courmont @ 2024-02-10  9:14 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le perjantaina 9. helmikuuta 2024, 17.34.40 EET flow gg a écrit :
> The issue here is that any load greater than e8 will fail the test(Bus
> error), so it cannot use vlse64 or similar methods...

AFAICT, data is aligned on 16 bytes here, so using larger element sizes should 
not be a problem. That being the case, you can load pretty much any power-of-
two byte quantity per row up to 512 bits, as 8 segments of 64-bit elements. 
That is more than enough to deal with 16-byte rows.

Of course, that results in a tiled data layout, so it only works if individual 
elements are all treated equally with no cross-row calculations. This might 
require trickery or not work at all for those functions that subtract adjacent 
values. But your patchset seems to leave those out anyway.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] lavc/me_cmp: R-V V pix_abs
  2024-02-10  9:14             ` Rémi Denis-Courmont
@ 2024-02-10  9:15               ` Rémi Denis-Courmont
  0 siblings, 0 replies; 9+ messages in thread
From: Rémi Denis-Courmont @ 2024-02-10  9:15 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le lauantaina 10. helmikuuta 2024, 11.14.11 EET Rémi Denis-Courmont a écrit :
> But your patchset seems to leave those out anyway.

Never mind that bit, I missed the other mails.


-- 
レミ・デニ-クールモン
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2024-02-10  9:15 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-06 15:55 [FFmpeg-devel] [PATCH 1/7] lavc/me_cmp: R-V V pix_abs flow gg
2024-02-06 16:58 ` Rémi Denis-Courmont
2024-02-07  0:01   ` flow gg
2024-02-08 19:41     ` Rémi Denis-Courmont
2024-02-08 22:39       ` flow gg
2024-02-09 10:32         ` Rémi Denis-Courmont
2024-02-09 15:34           ` flow gg
2024-02-10  9:14             ` Rémi Denis-Courmont
2024-02-10  9:15               ` Rémi Denis-Courmont

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git