* [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
@ 2024-03-02 12:06 flow gg
2024-03-07 18:55 ` Rémi Denis-Courmont
0 siblings, 1 reply; 8+ messages in thread
From: flow gg @ 2024-03-02 12:06 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 82 bytes --]
Here adjusting the order, rather than simply using .rept, will be 13%-24%
faster.
[-- Attachment #2: 0002-lavc-vc1dsp-R-V-V-mspel_pixels.patch --]
[-- Type: text/x-patch, Size: 5159 bytes --]
From 07aa3e2eff0fe1660ac82dec5d06d50fa4c433a4 Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Wed, 28 Feb 2024 16:32:39 +0800
Subject: [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.2
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 147.7
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.0
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.0
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 24.0
---
libavcodec/riscv/vc1dsp_init.c | 8 ++++
libavcodec/riscv/vc1dsp_rvv.S | 76 ++++++++++++++++++++++++++++++++++
2 files changed, 84 insertions(+)
diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
index e47b644f80..610c43a1a3 100644
--- a/libavcodec/riscv/vc1dsp_init.c
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
{
@@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
+ dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
+ dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
if (flags & AV_CPU_FLAG_RVV_I64) {
dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
+ dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
+ dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
}
}
#endif
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4a00945ead..af1df85403 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -111,3 +111,79 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
vsse32.v v0, (a0), a1
ret
endfunc
+
+func ff_put_pixels16x16_rvv, zve32x /* Copy 16 rows of 16 bytes from (a1) to (a0), rows a2 bytes apart; presumably a0/a1/a2 = dst/src/line_size per the C prototype, the 4th argument (rnd) is not read. */
+ vsetivli zero, 16, e8, m1, ta, ma /* vl = 16 elements of 8 bits, LMUL = 1: one 16-byte row per vector register */
+ .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+ vle8.v v\n, (a1) /* rows 0-14 are loaded into v16-v30 */
+ add a1, a1, a2 /* advance the source pointer by one row */
+ .endr
+ vle8.v v31, (a1) /* row 15; done outside the .irp so no pointer increment follows the last load */
+ .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+ vse8.v v\n, (a0) /* all 16 loads are issued before the first store; rows 0-14 are written here */
+ add a0, a0, a2 /* advance the destination pointer by one row */
+ .endr
+ vse8.v v31, (a0) /* row 15 */
+
+ ret
+endfunc
+
+func ff_put_pixels8x8_rvv, zve64x /* Copy 8 rows of 8 bytes from (a1) to (a0), rows a2 bytes apart, using a single strided load and a single strided store. */
+ vsetivli zero, 8, e8, mf2, ta, ma /* vl = 8; with SEW = 8 and LMUL = 1/2 the 64-bit accesses below get EMUL = 64/8 * 1/2 = 4 */
+ vlse64.v v8, (a1), a2 /* each 64-bit element is one 8-byte row; byte stride between elements is a2 */
+ vsse64.v v8, (a0), a2 /* write the same 8 rows back out with the same stride */
+
+ ret
+endfunc
+
+func ff_avg_pixels16x16_rvv, zve32x /* Average a 16x16 byte block at (a1) into the block at (a0), rows a2 bytes apart: each byte at (a0) becomes the rounded-up mean of itself and the byte at (a1). */
+ csrwi vxrm, 0 /* fixed-point rounding mode 0 (round-to-nearest-up), so vaaddu yields (a + b + 1) >> 1 */
+ vsetivli zero, 16, e8, m1, ta, ma /* vl = 16 elements of 8 bits, LMUL = 1: one 16-byte row per vector register */
+ .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+ vle8.v v\n, (a1) /* rows 0-14 of the block at (a1) go into v16-v30 */
+ add a1, a1, a2
+ .endr
+ vle8.v v31, (a1) /* row 15 of the block at (a1) */
+ .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ vle8.v v\n, (a0) /* rows 0-14 of the block at (a0) go into v0-v14 */
+ add a0, a0, a2
+ .endr
+ vle8.v v15, (a0) /* row 15; a0 is left pointing at the last row */
+ vaaddu.vv v0, v0, v16 /* one averaging add per row: v(k) = avg(v(k), v(k+16)) for k = 0..15 */
+ vaaddu.vv v1, v1, v17
+ vaaddu.vv v2, v2, v18
+ vaaddu.vv v3, v3, v19
+ vaaddu.vv v4, v4, v20
+ vaaddu.vv v5, v5, v21
+ vaaddu.vv v6, v6, v22
+ vaaddu.vv v7, v7, v23
+ vaaddu.vv v8, v8, v24
+ vaaddu.vv v9, v9, v25
+ vaaddu.vv v10, v10, v26
+ vaaddu.vv v11, v11, v27
+ vaaddu.vv v12, v12, v28
+ vaaddu.vv v13, v13, v29
+ vaaddu.vv v14, v14, v30
+ vaaddu.vv v15, v15, v31
+ .irp n 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+ vse8.v v\n, (a0) /* store in reverse row order, starting from row 15 where a0 already points */
+ sub a0, a0, a2 /* walk the pointer back up one row */
+ .endr
+ vse8.v v0, (a0) /* row 0 */
+
+ ret
+endfunc
+
+func ff_avg_pixels8x8_rvv, zve64x /* Average an 8x8 byte block at (a1) into the block at (a0), rows a2 bytes apart, with round-up averaging. */
+ csrwi vxrm, 0 /* fixed-point rounding mode 0 (round-to-nearest-up), so vaaddu yields (a + b + 1) >> 1 */
+ li t0, 64 /* 8 rows * 8 bytes: element count for the byte-wise add below */
+ vsetivli zero, 8, e8, mf2, ta, ma /* vl = 8; the 64-bit strided accesses below get EMUL = 64/8 * 1/2 = 4 */
+ vlse64.v v16, (a1), a2 /* 8 rows of the block at (a1), one 64-bit element per row, packed into v16.. */
+ vlse64.v v8, (a0), a2 /* 8 rows of the block at (a0), packed the same way into v8.. */
+ vsetvli zero, t0, e8, m4, ta, ma /* reinterpret the packed rows as 64 bytes in an LMUL = 4 group */
+ vaaddu.vv v16, v16, v8 /* byte-wise rounded average of all 64 bytes in one instruction */
+ vsetivli zero, 8, e8, mf2, ta, ma /* back to the configuration used for the 64-bit strided access */
+ vsse64.v v16, (a0), a2 /* scatter the 8 averaged rows back to (a0) */
+
+ ret
+endfunc
--
2.44.0
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
2024-03-02 12:06 [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels flow gg
@ 2024-03-07 18:55 ` Rémi Denis-Courmont
2024-03-08 0:45 ` flow gg
0 siblings, 1 reply; 8+ messages in thread
From: Rémi Denis-Courmont @ 2024-03-07 18:55 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Le lauantaina 2. maaliskuuta 2024, 14.06.13 EET flow gg a écrit :
> Here adjusting the order, rather than simply using .rept, will be 13%-24%
> faster.
Isn't it also faster to max LMUL for the adds here?
Also this might not be much noticeable on C908, but avoiding sequential
dependencies on the address registers may help. I mean, avoid using as address
operand a value that was calculated by the immediate previous instruction.
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
2024-03-07 18:55 ` Rémi Denis-Courmont
@ 2024-03-08 0:45 ` flow gg
2024-03-08 9:08 ` Rémi Denis-Courmont
0 siblings, 1 reply; 8+ messages in thread
From: flow gg @ 2024-03-08 0:45 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> Isn't it also faster to max LMUL for the adds here?
It requires the use of one more vset, making the time slightly longer:
147.7 (m1), 148.7 (m8 + vset).
> Also this might not be much noticeable on C908, but avoiding sequential
> dependencies on the address registers may help. I mean, avoid using as
> address operand a value that was calculated by the immediate previous
> instruction.
Okay, but the test results haven't changed.
It would add more than ten lines of code; perhaps shorter code would be better?
Rémi Denis-Courmont <remi@remlab.net> 于2024年3月8日周五 02:55写道:
> Le lauantaina 2. maaliskuuta 2024, 14.06.13 EET flow gg a écrit :
> > Here adjusting the order, rather than simply using .rept, will be 13%-24%
> > faster.
>
> Isn't it also faster to max LMUL for the adds here?
>
> Also this might not be much noticeable on C908, but avoiding sequential
> dependencies on the address registers may help. I mean, avoid using as
> address
> operand a value that was calculated by the immediate previous instruction.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
2024-03-08 0:45 ` flow gg
@ 2024-03-08 9:08 ` Rémi Denis-Courmont
2024-03-08 9:46 ` flow gg
0 siblings, 1 reply; 8+ messages in thread
From: Rémi Denis-Courmont @ 2024-03-08 9:08 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Le 8 mars 2024 02:45:46 GMT+02:00, flow gg <hlefthleft@gmail.com> a écrit :
>> Isn't it also faster to max LMUL for the adds here?
>
>It requires the use of one more vset, making the time slightly longer:
>147.7 (m1), 148.7 (m8 + vset).
A variation of 0.6% on a single set of kernels will end up below measurement noise in real overall codec usage. And then reducing the I-cache contention can improve performance in other ways. Larger LMUL should also improve performance on bigger cores with more ALUs. So it's not all black and white.
My personal preference is to keep the code small if it makes almost no difference but I'm not BDFL.
>Also this might not be much noticeable on C908, but avoiding sequential
>dependencies on the address registers may help. I mean, avoid using as
>address
>operand a value that was calculated by the immediate previous instruction.
>
>> Okay, but the test results haven't changed..
>It would add more than ten lines of code, perhaps shorter code will better?
I don't know. There are definitely in-order vector cores coming, and data dependencies will hurt them. But I don't know if anyone will care about FFmpeg on those.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
2024-03-08 9:08 ` Rémi Denis-Courmont
@ 2024-03-08 9:46 ` flow gg
2024-04-07 5:38 ` flow gg
0 siblings, 1 reply; 8+ messages in thread
From: flow gg @ 2024-03-08 9:46 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 1735 bytes --]
Alright, I am now using m8, but for now I have not added code to avoid the
address dependencies, since they have only a minor impact. The updated patch
is attached to this reply.
Rémi Denis-Courmont <remi@remlab.net> 于2024年3月8日周五 17:08写道:
>
>
> Le 8 mars 2024 02:45:46 GMT+02:00, flow gg <hlefthleft@gmail.com> a
> écrit :
> >> Isn't it also faster to max LMUL for the adds here?
> >
> >It requires the use of one more vset, making the time slightly longer:
> >147.7 (m1), 148.7 (m8 + vset).
>
> A variation of 0.6% on a single set of kernels will end up below
> measurement noise in real overall codec usage. And then reducing the
> I-cache contention can improve performance in other ways. Larger LMUL
> should also improve performance on bigger cores with more ALUs. So it's not
> all black and white.
>
> My personal preference is to keep the code small if it makes almost no
> difference but I'm not BDFL.
>
> >Also this might not be much noticeable on C908, but avoiding sequential
> >dependencies on the address registers may help. I mean, avoid using as
> >address
> >operand a value that was calculated by the immediate previous instruction.
> >
> >> Okay, but the test results haven't changed..
> >It would add more than ten lines of code, perhaps shorter code will
> better?
>
> I don't know. There are definitely in-order vector cores coming, and data
> dependencies will hurt them. But I don't know if anyone will care about
> FFmpeg on those.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
[-- Attachment #2: 0002-lavc-vc1dsp-R-V-V-mspel_pixels.patch --]
[-- Type: text/x-patch, Size: 4795 bytes --]
From 47ae233e6bb8f52dd7d92ac062bed1ac85ac49a0 Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Wed, 28 Feb 2024 16:32:39 +0800
Subject: [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
vc1tmp
---
libavcodec/riscv/vc1dsp_init.c | 8 +++++
libavcodec/riscv/vc1dsp_rvv.S | 66 ++++++++++++++++++++++++++++++++++
2 files changed, 74 insertions(+)
diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
index e47b644f80..610c43a1a3 100644
--- a/libavcodec/riscv/vc1dsp_init.c
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
{
@@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
+ dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
+ dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
if (flags & AV_CPU_FLAG_RVV_I64) {
dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
+ dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
+ dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
}
}
#endif
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4a00945ead..48244f91aa 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
vsse32.v v0, (a0), a1
ret
endfunc
+
+func ff_put_pixels16x16_rvv, zve32x /* Copy 16 rows of 16 bytes from (a1) to (a0), rows a2 bytes apart; presumably a0/a1/a2 = dst/src/line_size per the C prototype, the 4th argument (rnd) is not read. */
+ vsetivli zero, 16, e8, m1, ta, ma /* vl = 16 elements of 8 bits, LMUL = 1: one 16-byte row per vector register */
+ .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+ vle8.v v\n, (a1) /* rows 0-14 are loaded into v16-v30 */
+ add a1, a1, a2 /* advance the source pointer by one row */
+ .endr
+ vle8.v v31, (a1) /* row 15; done outside the .irp so no pointer increment follows the last load */
+ .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+ vse8.v v\n, (a0) /* all 16 loads are issued before the first store; rows 0-14 are written here */
+ add a0, a0, a2 /* advance the destination pointer by one row */
+ .endr
+ vse8.v v31, (a0) /* row 15 */
+
+ ret
+endfunc
+
+func ff_put_pixels8x8_rvv, zve64x /* Copy 8 rows of 8 bytes from (a1) to (a0), rows a2 bytes apart, using a single strided load and a single strided store. */
+ vsetivli zero, 8, e8, mf2, ta, ma /* vl = 8; with SEW = 8 and LMUL = 1/2 the 64-bit accesses below get EMUL = 64/8 * 1/2 = 4 */
+ vlse64.v v8, (a1), a2 /* each 64-bit element is one 8-byte row; byte stride between elements is a2 */
+ vsse64.v v8, (a0), a2 /* write the same 8 rows back out with the same stride */
+
+ ret
+endfunc
+
+func ff_avg_pixels16x16_rvv, zve32x /* Average a 16x16 byte block at (a1) into the block at (a0), rows a2 bytes apart, using two LMUL = 8 averaging adds instead of sixteen LMUL = 1 adds. */
+ csrwi vxrm, 0 /* fixed-point rounding mode 0 (round-to-nearest-up), so vaaddu yields (a + b + 1) >> 1 */
+ vsetivli zero, 16, e8, m1, ta, ma /* vl = 16 elements of 8 bits, LMUL = 1: one 16-byte row per vector register */
+ li t0, 128 /* 8 registers * 16 bytes: element count for each LMUL = 8 add below */
+
+ .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+ vle8.v v\n, (a1) /* rows 0-14 of the block at (a1) go into v16-v30 */
+ add a1, a1, a2
+ .endr
+ vle8.v v31, (a1) /* row 15 of the block at (a1) */
+ .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ vle8.v v\n, (a0) /* rows 0-14 of the block at (a0) go into v0-v14 */
+ add a0, a0, a2
+ .endr
+ vle8.v v15, (a0) /* row 15; a0 is left pointing at the last row */
+ vsetvli zero, t0, e8, m8, ta, ma /* NOTE(review): vl = 128 spans all eight registers of a group only when VLEN is exactly 128 bits; the init code enables this function for vlenb >= 16, so on wider vectors some rows look like they would be left unaveraged - confirm (vl = VLMAX would avoid the dependency) */
+ vaaddu.vv v0, v0, v16 /* rows 0-7: group v0-v7 averaged with group v16-v23 */
+ vaaddu.vv v8, v8, v24 /* rows 8-15: group v8-v15 averaged with group v24-v31 */
+ vsetivli zero, 16, e8, m1, ta, ma /* back to one 16-byte row per register for the stores */
+ .irp n 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+ vse8.v v\n, (a0) /* store in reverse row order, starting from row 15 where a0 already points */
+ sub a0, a0, a2 /* walk the pointer back up one row */
+ .endr
+ vse8.v v0, (a0) /* row 0 */
+
+ ret
+endfunc
+
+func ff_avg_pixels8x8_rvv, zve64x /* Average an 8x8 byte block at (a1) into the block at (a0), rows a2 bytes apart, with round-up averaging. */
+ csrwi vxrm, 0 /* fixed-point rounding mode 0 (round-to-nearest-up), so vaaddu yields (a + b + 1) >> 1 */
+ li t0, 64 /* 8 rows * 8 bytes: element count for the byte-wise add below */
+ vsetivli zero, 8, e8, mf2, ta, ma /* vl = 8; the 64-bit strided accesses below get EMUL = 64/8 * 1/2 = 4 */
+ vlse64.v v16, (a1), a2 /* 8 rows of the block at (a1), one 64-bit element per row, packed into v16.. */
+ vlse64.v v8, (a0), a2 /* 8 rows of the block at (a0), packed the same way into v8.. */
+ vsetvli zero, t0, e8, m4, ta, ma /* reinterpret the packed rows as 64 bytes in an LMUL = 4 group */
+ vaaddu.vv v16, v16, v8 /* byte-wise rounded average of all 64 bytes in one instruction */
+ vsetivli zero, 8, e8, mf2, ta, ma /* back to the configuration used for the 64-bit strided access */
+ vsse64.v v16, (a0), a2 /* scatter the 8 averaged rows back to (a0) */
+
+ ret
+endfunc
--
2.44.0
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels
2024-03-08 9:46 ` flow gg
@ 2024-04-07 5:38 ` flow gg
2024-04-28 18:06 ` Rémi Denis-Courmont
0 siblings, 1 reply; 8+ messages in thread
From: flow gg @ 2024-04-07 5:38 UTC (permalink / raw)
To: FFmpeg development discussions and patches
ping
flow gg <hlefthleft@gmail.com> 于2024年3月8日周五 17:46写道:
> Alright, using m8, but for now don't add code to address dependencies in
> loops that have a minor impact. Updated in the reply
>
> Rémi Denis-Courmont <remi@remlab.net> 于2024年3月8日周五 17:08写道:
>
>>
>>
>> Le 8 mars 2024 02:45:46 GMT+02:00, flow gg <hlefthleft@gmail.com> a
>> écrit :
>> >> Isn't it also faster to max LMUL for the adds here?
>> >
>> >It requires the use of one more vset, making the time slightly longer:
>> >147.7 (m1), 148.7 (m8 + vset).
>>
>> A variation of 0.6% on a single set of kernels will end up below
>> measurement noise in real overall codec usage. And then reducing the
>> I-cache contention can improve performance in other ways. Larger LMUL
>> should also improve performance on bigger cores with more ALUs. So it's not
>> all black and white.
>>
>> My personal preference is to keep the code small if it makes almost no
>> difference but I'm not BDFL.
>>
>> >Also this might not be much noticeable on C908, but avoiding sequential
>> >dependencies on the address registers may help. I mean, avoid using as
>> >address
>> >operand a value that was calculated by the immediate previous
>> instruction.
>> >
>> >> Okay, but the test results haven't changed..
>> >It would add more than ten lines of code, perhaps shorter code will
>> better?
>>
>> I don't know. There are definitely in-order vector cores coming, and data
>> dependencies will hurt them. But I don't know if anyone will care about
>> FFmpeg on those.
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2024-04-29 7:09 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-02 12:06 [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels flow gg
2024-03-07 18:55 ` Rémi Denis-Courmont
2024-03-08 0:45 ` flow gg
2024-03-08 9:08 ` Rémi Denis-Courmont
2024-03-08 9:46 ` flow gg
2024-04-07 5:38 ` flow gg
2024-04-28 18:06 ` Rémi Denis-Courmont
2024-04-29 7:09 ` flow gg
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git