Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
@ 2024-03-02 12:06 flow gg
  2024-03-07 18:55 ` Rémi Denis-Courmont
  0 siblings, 1 reply; 8+ messages in thread
From: flow gg @ 2024-03-02 12:06 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 82 bytes --]

Here adjusting the order, rather than simply using .rept, will be 13%-24%
faster.

[-- Attachment #2: 0002-lavc-vc1dsp-R-V-V-mspel_pixels.patch --]
[-- Type: text/x-patch, Size: 5159 bytes --]

From 07aa3e2eff0fe1660ac82dec5d06d50fa4c433a4 Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Wed, 28 Feb 2024 16:32:39 +0800
Subject: [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels

vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.2
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 147.7
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.0
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.0
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 24.0
---
 libavcodec/riscv/vc1dsp_init.c |  8 ++++
 libavcodec/riscv/vc1dsp_rvv.S  | 76 ++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)

diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
index e47b644f80..610c43a1a3 100644
--- a/libavcodec/riscv/vc1dsp_init.c
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
 void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
 
 av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
 {
@@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
     if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
         dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
         dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
+        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
+        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
         if (flags & AV_CPU_FLAG_RVV_I64) {
             dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
             dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
+            dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
+            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
         }
     }
 #endif
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4a00945ead..af1df85403 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -111,3 +111,79 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
         vsse32.v      v0, (a0), a1
         ret
 endfunc
+
+func ff_put_pixels16x16_rvv, zve32x
+        vsetivli      zero, 16, e8, m1, ta, ma
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vle8.v        v\n, (a1)
+        add           a1, a1, a2
+        .endr
+        vle8.v        v31, (a1)
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vse8.v        v\n, (a0)
+        add           a0, a0, a2
+        .endr
+        vse8.v        v31, (a0)
+
+        ret
+endfunc
+
+func ff_put_pixels8x8_rvv, zve64x
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v8, (a1), a2
+        vsse64.v      v8, (a0), a2
+
+        ret
+endfunc
+
+func ff_avg_pixels16x16_rvv, zve32x
+        csrwi         vxrm, 0
+        vsetivli      zero, 16, e8, m1, ta, ma
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vle8.v        v\n, (a1)
+        add           a1, a1, a2
+        .endr
+        vle8.v        v31, (a1)
+        .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+        vle8.v        v\n, (a0)
+        add           a0, a0, a2
+        .endr
+        vle8.v        v15, (a0)
+        vaaddu.vv     v0, v0, v16
+        vaaddu.vv     v1, v1, v17
+        vaaddu.vv     v2, v2, v18
+        vaaddu.vv     v3, v3, v19
+        vaaddu.vv     v4, v4, v20
+        vaaddu.vv     v5, v5, v21
+        vaaddu.vv     v6, v6, v22
+        vaaddu.vv     v7, v7, v23
+        vaaddu.vv     v8, v8, v24
+        vaaddu.vv     v9, v9, v25
+        vaaddu.vv     v10, v10, v26
+        vaaddu.vv     v11, v11, v27
+        vaaddu.vv     v12, v12, v28
+        vaaddu.vv     v13, v13, v29
+        vaaddu.vv     v14, v14, v30
+        vaaddu.vv     v15, v15, v31
+        .irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+        vse8.v        v\n, (a0)
+        sub           a0, a0, a2
+        .endr
+        vse8.v        v0, (a0)
+
+        ret
+endfunc
+
+func ff_avg_pixels8x8_rvv, zve64x
+        csrwi         vxrm, 0
+        li            t0, 64
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v16, (a1), a2
+        vlse64.v      v8, (a0), a2
+        vsetvli       zero, t0, e8, m4, ta, ma
+        vaaddu.vv     v16, v16, v8
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vsse64.v      v16, (a0), a2
+
+        ret
+endfunc
-- 
2.44.0


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
  2024-03-02 12:06 [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels flow gg
@ 2024-03-07 18:55 ` Rémi Denis-Courmont
  2024-03-08  0:45   ` flow gg
  0 siblings, 1 reply; 8+ messages in thread
From: Rémi Denis-Courmont @ 2024-03-07 18:55 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le lauantaina 2. maaliskuuta 2024, 14.06.13 EET flow gg a écrit :
> Here adjusting the order, rather than simply using .rept, will be 13%-24%
> faster.

Isn't it also faster to max LMUL for the adds here?

Also this might not be much noticeable on C908, but avoiding sequential 
dependencies on the address registers may help. I mean, avoid using as address 
operand a value that was calculated by the immediate previous instruction.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
  2024-03-07 18:55 ` Rémi Denis-Courmont
@ 2024-03-08  0:45   ` flow gg
  2024-03-08  9:08     ` Rémi Denis-Courmont
  0 siblings, 1 reply; 8+ messages in thread
From: flow gg @ 2024-03-08  0:45 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

> Isn't it also faster to max LMUL for the adds here?

It requires the use of one more vset, making the time slightly longer:
147.7 (m1), 148.7 (m8 + vset).

> Also this might not be much noticeable on C908, but avoiding sequential
> dependencies on the address registers may help. I mean, avoid using as
> address operand a value that was calculated by the immediate previous
> instruction.

Okay, but the test results haven't changed.
It would add more than ten lines of code; perhaps shorter code would be better?

Rémi Denis-Courmont <remi@remlab.net> 于2024年3月8日周五 02:55写道:

> Le lauantaina 2. maaliskuuta 2024, 14.06.13 EET flow gg a écrit :
> > Here adjusting the order, rather than simply using .rept, will be 13%-24%
> > faster.
>
> Isn't it also faster to max LMUL for the adds here?
>
> Also this might not be much noticeable on C908, but avoiding sequential
> dependencies on the address registers may help. I mean, avoid using as
> address
> operand a value that was calculated by the immediate previous instruction.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
  2024-03-08  0:45   ` flow gg
@ 2024-03-08  9:08     ` Rémi Denis-Courmont
  2024-03-08  9:46       ` flow gg
  0 siblings, 1 reply; 8+ messages in thread
From: Rémi Denis-Courmont @ 2024-03-08  9:08 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



Le 8 mars 2024 02:45:46 GMT+02:00, flow gg <hlefthleft@gmail.com> a écrit :
>> Isn't it also faster to max LMUL for the adds here?
>
>It requires the use of one more vset, making the time slightly longer:
>147.7 (m1), 148.7 (m8 + vset).

A variation of 0.6% on a single set of kernels will end up below measurement noise in real overall codec usage. And then reducing the I-cache contention can improve performance in other ways. Larger LMUL should also improve performance on bigger cores with more ALUs. So it's not all black and white.

My personal preference is to keep the code small if it makes almost no difference but I'm not BDFL.

>Also this might not be much noticeable on C908, but avoiding sequential
>dependencies on the address registers may help. I mean, avoid using as
>address
>operand a value that was calculated by the immediate previous instruction.
>
>> Okay, but the test results haven't changed..
>It would add more than ten lines of code, perhaps shorter code will better?

I don't know. There are definitely in-order vector cores coming, and data dependencies will hurt them. But I don't know if anyone will care about FFmpeg on those.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
  2024-03-08  9:08     ` Rémi Denis-Courmont
@ 2024-03-08  9:46       ` flow gg
  2024-04-07  5:38         ` flow gg
  0 siblings, 1 reply; 8+ messages in thread
From: flow gg @ 2024-03-08  9:46 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1735 bytes --]

Alright, I have switched to m8, but for now I have not added code to address
the dependencies in the loops, which have only a minor impact. The updated
patch is attached to this reply.

Rémi Denis-Courmont <remi@remlab.net> 于2024年3月8日周五 17:08写道:

>
>
> Le 8 mars 2024 02:45:46 GMT+02:00, flow gg <hlefthleft@gmail.com> a
> écrit :
> >> Isn't it also faster to max LMUL for the adds here?
> >
> >It requires the use of one more vset, making the time slightly longer:
> >147.7 (m1), 148.7 (m8 + vset).
>
> A variation of 0.6% on a single set of kernels will end up below
> measurement noise in real overall codec usage. And then reducing the
> I-cache contention can improve performance in other ways. Larger LMUL
> should also improve performance on bigger cores with more ALUs. So it's not
> all black and white.
>
> My personal preference is to keep the code small if it makes almost no
> difference but I'm not BDFL.
>
> >Also this might not be much noticeable on C908, but avoiding sequential
> >dependencies on the address registers may help. I mean, avoid using as
> >address
> >operand a value that was calculated by the immediate previous instruction.
> >
> >> Okay, but the test results haven't changed..
> >It would add more than ten lines of code, perhaps shorter code will
> better?
>
> I don't know. There are definitely in-order vector cores coming, and data
> dependencies will hurt them. But I don't know if anyone will care about
> FFmpeg on those.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>

[-- Attachment #2: 0002-lavc-vc1dsp-R-V-V-mspel_pixels.patch --]
[-- Type: text/x-patch, Size: 4795 bytes --]

From 47ae233e6bb8f52dd7d92ac062bed1ac85ac49a0 Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Wed, 28 Feb 2024 16:32:39 +0800
Subject: [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels

vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7

vc1tmp
---
 libavcodec/riscv/vc1dsp_init.c |  8 +++++
 libavcodec/riscv/vc1dsp_rvv.S  | 66 ++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
index e47b644f80..610c43a1a3 100644
--- a/libavcodec/riscv/vc1dsp_init.c
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
 void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
 
 av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
 {
@@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
     if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
         dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
         dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
+        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
+        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
         if (flags & AV_CPU_FLAG_RVV_I64) {
             dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
             dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
+            dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
+            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
         }
     }
 #endif
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4a00945ead..48244f91aa 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
         vsse32.v      v0, (a0), a1
         ret
 endfunc
+
+func ff_put_pixels16x16_rvv, zve32x
+        vsetivli      zero, 16, e8, m1, ta, ma
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vle8.v        v\n, (a1)
+        add           a1, a1, a2
+        .endr
+        vle8.v        v31, (a1)
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vse8.v        v\n, (a0)
+        add           a0, a0, a2
+        .endr
+        vse8.v        v31, (a0)
+
+        ret
+endfunc
+
+func ff_put_pixels8x8_rvv, zve64x
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v8, (a1), a2
+        vsse64.v      v8, (a0), a2
+
+        ret
+endfunc
+
+func ff_avg_pixels16x16_rvv, zve32x
+        csrwi         vxrm, 0
+        vsetivli      zero, 16, e8, m1, ta, ma
+        li            t0, 128
+
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vle8.v        v\n, (a1)
+        add           a1, a1, a2
+        .endr
+        vle8.v        v31, (a1)
+        .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+        vle8.v        v\n, (a0)
+        add           a0, a0, a2
+        .endr
+        vle8.v        v15, (a0)
+        vsetvli       zero, t0, e8, m8, ta, ma
+        vaaddu.vv     v0, v0, v16
+        vaaddu.vv     v8, v8, v24
+        vsetivli      zero, 16, e8, m1, ta, ma
+        .irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+        vse8.v        v\n, (a0)
+        sub           a0, a0, a2
+        .endr
+        vse8.v        v0, (a0)
+
+        ret
+endfunc
+
+func ff_avg_pixels8x8_rvv, zve64x
+        csrwi         vxrm, 0
+        li            t0, 64
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v16, (a1), a2
+        vlse64.v      v8, (a0), a2
+        vsetvli       zero, t0, e8, m4, ta, ma
+        vaaddu.vv     v16, v16, v8
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vsse64.v      v16, (a0), a2
+
+        ret
+endfunc
-- 
2.44.0


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
  2024-03-08  9:46       ` flow gg
@ 2024-04-07  5:38         ` flow gg
  2024-04-28 18:06           ` Rémi Denis-Courmont
  0 siblings, 1 reply; 8+ messages in thread
From: flow gg @ 2024-04-07  5:38 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

ping

flow gg <hlefthleft@gmail.com> 于2024年3月8日周五 17:46写道:

> Alright, using m8, but for now don't add code to address dependencies in
> loops that have a minor impact. Updated in the reply
>
> Rémi Denis-Courmont <remi@remlab.net> 于2024年3月8日周五 17:08写道:
>
>>
>>
>> Le 8 mars 2024 02:45:46 GMT+02:00, flow gg <hlefthleft@gmail.com> a
>> écrit :
>> >> Isn't it also faster to max LMUL for the adds here?
>> >
>> >It requires the use of one more vset, making the time slightly longer:
>> >147.7 (m1), 148.7 (m8 + vset).
>>
>> A variation of 0.6% on a single set of kernels will end up below
>> measurement noise in real overall codec usage. And then reducing the
>> I-cache contention can improve performance in other ways. Larger LMUL
>> should also improve performance on bigger cores with more ALUs. So it's not
>> all black and white.
>>
>> My personal preference is to keep the code small if it makes almost no
>> difference but I'm not BDFL.
>>
>> >Also this might not be much noticeable on C908, but avoiding sequential
>> >dependencies on the address registers may help. I mean, avoid using as
>> >address
>> >operand a value that was calculated by the immediate previous
>> instruction.
>> >
>> >> Okay, but the test results haven't changed..
>> >It would add more than ten lines of code, perhaps shorter code will
>> better?
>>
>> I don't know. There are definitely in-order vector cores coming, and data
>> dependencies will hurt them. But I don't know if anyone will care about
>> FFmpeg on those.
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
  2024-04-07  5:38         ` flow gg
@ 2024-04-28 18:06           ` Rémi Denis-Courmont
  2024-04-29  7:09             ` flow gg
  0 siblings, 1 reply; 8+ messages in thread
From: Rémi Denis-Courmont @ 2024-04-28 18:06 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le sunnuntaina 7. huhtikuuta 2024, 8.38.54 EEST flow gg a écrit :
> ping

I have been away for a while, and catching up takes time, sorry.

-- 
レミ・デニ-クールモン
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
  2024-04-28 18:06           ` Rémi Denis-Courmont
@ 2024-04-29  7:09             ` flow gg
  0 siblings, 0 replies; 8+ messages in thread
From: flow gg @ 2024-04-29  7:09 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Happy to see you back :)

Rémi Denis-Courmont <remi@remlab.net> 于2024年4月29日周一 02:06写道:

> Le sunnuntaina 7. huhtikuuta 2024, 8.38.54 EEST flow gg a écrit :
> > ping
>
> I have been away for a while, and catching up takes time, sorry.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2024-04-29  7:09 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-02 12:06 [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels flow gg
2024-03-07 18:55 ` Rémi Denis-Courmont
2024-03-08  0:45   ` flow gg
2024-03-08  9:08     ` Rémi Denis-Courmont
2024-03-08  9:46       ` flow gg
2024-04-07  5:38         ` flow gg
2024-04-28 18:06           ` Rémi Denis-Courmont
2024-04-29  7:09             ` flow gg

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git