* [FFmpeg-devel] [PATCHv4 1/4] lavu/riscv: add assembler macros for adjusting vector LMUL
@ 2024-05-16 16:48 Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 2/4] lavc/flacdsp: optimise RVV vector type for lpc16 Rémi Denis-Courmont
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-16 16:48 UTC (permalink / raw)
To: ffmpeg-devel
vtype_vli computes the VTYPE value with the optimal LMUL for a given
element width, tail and mask policies and a run-time vector length.
vtype_ivli does the same, but with a compile-time constant vector
length.
vwtypei and vntypei can be used to widen or narrow a VTYPE value for
use in mixed-width vector-optimised functions.
---
libavutil/riscv/asm.S | 166 +++++++++++++++++++++++++++++-------------
1 file changed, 117 insertions(+), 49 deletions(-)
diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S
index 14be5055f5..1e6358dcb5 100644
--- a/libavutil/riscv/asm.S
+++ b/libavutil/riscv/asm.S
@@ -96,77 +96,145 @@
.endm
#endif
- /* Convenience macro to load a Vector type (vtype) as immediate */
- .macro lvtypei rd, e, m=m1, tp=tu, mp=mu
+#if defined (__riscv_v_elen)
+# define RV_V_ELEN __riscv_v_elen
+#else
+/* Run-time detection of the V extension implies ELEN >= 64. */
+# define RV_V_ELEN 64
+#endif
+#if RV_V_ELEN == 32
+# define VSEW_MAX 2
+#else
+# define VSEW_MAX 3
+#endif
- .ifc \e,e8
- .equ ei, 0
+ .macro parse_vtype ew, tp, mp
+ .ifc \ew,e8
+ .equ vsew, 0
.else
- .ifc \e,e16
- .equ ei, 8
+ .ifc \ew,e16
+ .equ vsew, 1
.else
- .ifc \e,e32
- .equ ei, 16
+ .ifc \ew,e32
+ .equ vsew, 2
.else
- .ifc \e,e64
- .equ ei, 24
+ .ifc \ew,e64
+ .equ vsew, 3
.else
- .error "Unknown element type"
+ .error "Unknown element width \ew"
.endif
.endif
.endif
.endif
- .ifc \m,m1
- .equ mi, 0
- .else
- .ifc \m,m2
- .equ mi, 1
- .else
- .ifc \m,m4
- .equ mi, 2
+ .ifc \tp,tu
+ .equ tp, 0
.else
- .ifc \m,m8
- .equ mi, 3
+ .ifc \tp,ta
+ .equ tp, 1
.else
- .ifc \m,mf8
- .equ mi, 5
- .else
- .ifc \m,mf4
- .equ mi, 6
- .else
- .ifc \m,mf2
- .equ mi, 7
- .else
- .error "Unknown multiplier"
- .equ mi, 3
- .endif
- .endif
- .endif
- .endif
- .endif
+ .error "Unknown tail policy \tp"
.endif
.endif
- .ifc \tp,tu
- .equ tpi, 0
+ .ifc \mp,mu
+ .equ mp, 0
.else
- .ifc \tp,ta
- .equ tpi, 64
+ .ifc \mp,ma
+ .equ mp, 1
.else
- .error "Unknown tail policy"
+ .error "Unknown mask policy \mp"
.endif
.endif
+ .endm
- .ifc \mp,mu
- .equ mpi, 0
- .else
- .ifc \mp,ma
- .equ mpi, 128
+ /**
+ * Gets the vector type with the smallest suitable LMUL value.
+ * @param[out] rd vector type destination register
+ * @param avl vector length constant
+ * @param ew element width: e8, e16, e32 or e64
+ * @param tp tail policy: tu or ta
+ * @param mp mask policy: mu or ma
+ */
+ .macro vtype_ivli rd, avl, ew, tp=tu, mp=mu
+ .if \avl <= 1
+ .equ log2vl, 0
+ .elseif \avl <= 2
+ .equ log2vl, 1
+ .elseif \avl <= 4
+ .equ log2vl, 2
+ .elseif \avl <= 8
+ .equ log2vl, 3
+ .elseif \avl <= 16
+ .equ log2vl, 4
+ .elseif \avl <= 32
+ .equ log2vl, 5
+ .elseif \avl <= 64
+ .equ log2vl, 6
+ .elseif \avl <= 128
+ .equ log2vl, 7
.else
- .error "Unknown mask policy"
+ .error "Vector length \avl out of range"
.endif
+ parse_vtype \ew, \tp, \mp
+ csrr \rd, vlenb
+ clz \rd, \rd
+ addi \rd, \rd, log2vl + 1 + VSEW_MAX - __riscv_xlen
+ max \rd, \rd, zero // VLMUL must be >= VSEW - VSEW_MAX
+ .if vsew < VSEW_MAX
+ addi \rd, \rd, vsew - VSEW_MAX
+ andi \rd, \rd, 7
.endif
+ ori \rd, \rd, (vsew << 3) | (tp << 6) | (mp << 7)
+ .endm
+
+ /**
+ * Gets the vector type with the smallest suitable LMUL value.
+ * @param[out] rd vector type destination register
+ * @param rs vector length source register
+ * @param[out] tmp temporary register to be clobbered
+ * @param ew element width: e8, e16, e32 or e64
+ * @param tp tail policy: tu or ta
+ * @param mp mask policy: mu or ma
+ */
+ .macro vtype_vli rd, rs, tmp, ew, tp=tu, mp=mu
+ parse_vtype \ew, \tp, \mp
+ /*
+ * The difference between the CLZ's notionally equals the VLMUL value
+ * for 4-bit elements. But we want the value for SEW_MAX-bit elements.
+ */
+ slli \tmp, \rs, 1 + VSEW_MAX
+ csrr \rd, vlenb
+ addi \tmp, \tmp, -1
+ clz \rd, \rd
+ clz \tmp, \tmp
+ sub \rd, \rd, \tmp
+ max \rd, \rd, zero // VLMUL must be >= VSEW - VSEW_MAX
+ .if vsew < VSEW_MAX
+ addi \rd, \rd, vsew - VSEW_MAX
+ andi \rd, \rd, 7
+ .endif
+ ori \rd, \rd, (vsew << 3) | (tp << 6) | (mp << 7)
+ .endm
+
+ /**
+ * Widens a vector type.
+ * @param[out] rd widened vector type destination register
+ * @param rs vector type source register
+ * @param n number of times to widen (once by default)
+ */
+ .macro vwtypei rd, rs, n=1
+ xori \rd, \rs, 4
+ addi \rd, \rd, (\n) * 011
+ xori \rd, \rd, 4
+ .endm
- li \rd, (ei | mi | tpi | mpi)
+ /**
+ * Narrows a vector type.
+ * @param[out] rd narrowed vector type destination register
+ * @param rs vector type source register
+ * @param n number of times to narrow (once by default)
+ */
+ .macro vntypei rd, rs, n=1
+ vwtypei \rd, \rs, -(\n)
.endm
--
2.43.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCHv4 2/4] lavc/flacdsp: optimise RVV vector type for lpc16
2024-05-16 16:48 [FFmpeg-devel] [PATCHv4 1/4] lavu/riscv: add assembler macros for adjusting vector LMUL Rémi Denis-Courmont
@ 2024-05-16 16:48 ` Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 3/4] lavc/flacdsp: optimise RVV vector type for lpc32 Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32 Rémi Denis-Courmont
2 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-16 16:48 UTC (permalink / raw)
To: ffmpeg-devel
This calculates the optimal vector type value at run-time based on the
hardware vector length and the FLAC LPC prediction order. In this
particular case, the additional computation is easily amortised over
the loop iterations:
T-Head C908:
C V before V after
1 48.0 214.7 95.2
2 64.7 214.2 94.7
3 79.7 213.5 94.5
4 96.2 196.5 94.2 #
5 111.0 195.7 118.5
6 127.0 211.2 102.0
7 143.7 194.2 101.5
8 175.7 193.2 101.2 #
9 176.2 224.2 126.0
10 191.5 192.0 125.5
11 224.5 191.2 124.7
12 223.0 190.2 124.2
13 239.2 189.5 123.7
14 253.7 188.7 139.5
15 286.2 188.0 122.7
16 284.0 187.0 122.5 #
17 300.2 186.5 186.5
18 314.0 185.5 185.7
19 329.7 184.7 185.0
20 343.0 184.2 184.2
21 358.7 199.2 183.7
22 371.7 182.7 182.7
23 387.5 181.7 182.0
24 400.7 181.0 181.2
25 431.5 180.2 196.5
26 443.7 195.5 196.0
27 459.0 178.7 196.2
28 470.7 177.7 194.2
29 470.0 177.0 193.5
30 481.2 176.2 176.5
31 496.2 175.5 175.7
32 507.2 174.7 191.0 #
# Power of two boundary.
With 128-bit vectors, improvements are expected for the first two
test cases only. For the other two, there is overhead but below noise.
Improvements should be better observable with prediction order of 8
and less, or on hardware with larger vector sizes.
The same optimisation strategy should be applicable to LPC32
(and work-in-progress LPC33), but is left as a future exercise.
flac
lpc16
---
libavcodec/riscv/flacdsp_init.c | 2 +-
libavcodec/riscv/flacdsp_rvv.S | 5 +++--
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c
index 4f1652dbe7..735aec0691 100644
--- a/libavcodec/riscv/flacdsp_init.c
+++ b/libavcodec/riscv/flacdsp_init.c
@@ -71,7 +71,7 @@ av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
int vlenb = ff_get_rv_vlenb();
- if (vlenb >= 16)
+ if ((flags & AV_CPU_FLAG_RVB_BASIC) && vlenb >= 16)
c->lpc16 = ff_flac_lpc16_rvv;
# if (__riscv_xlen >= 64)
diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
index 6287faa260..e1a20ce8e1 100644
--- a/libavcodec/riscv/flacdsp_rvv.S
+++ b/libavcodec/riscv/flacdsp_rvv.S
@@ -20,8 +20,9 @@
#include "libavutil/riscv/asm.S"
-func ff_flac_lpc16_rvv, zve32x
- vsetvli zero, a2, e32, m8, ta, ma
+func ff_flac_lpc16_rvv, zve32x, zbb
+ vtype_vli t0, a2, t2, e32, ta, ma
+ vsetvl zero, a2, t0
vle32.v v8, (a1)
sub a4, a4, a2
vle32.v v16, (a0)
--
2.43.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCHv4 3/4] lavc/flacdsp: optimise RVV vector type for lpc32
2024-05-16 16:48 [FFmpeg-devel] [PATCHv4 1/4] lavu/riscv: add assembler macros for adjusting vector LMUL Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 2/4] lavc/flacdsp: optimise RVV vector type for lpc16 Rémi Denis-Courmont
@ 2024-05-16 16:48 ` Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32 Rémi Denis-Courmont
2 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-16 16:48 UTC (permalink / raw)
To: ffmpeg-devel
This is pretty much the same as for lpc16, though the improvement only
applies to prediction orders half as large. With 128-bit vectors, this gives:
C V old V new
1 69.2 181.5 95.5
2 107.7 180.7 95.2
3 145.5 180.0 103.5
4 183.0 179.2 102.7
5 220.7 178.5 128.0
6 257.7 194.0 127.5
7 294.5 193.7 126.7
8 331.0 193.0 126.5
Larger prediction orders see no significant changes at that size.
---
libavcodec/riscv/flacdsp_init.c | 15 ++++++++-------
libavcodec/riscv/flacdsp_rvv.S | 12 +++++++-----
2 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c
index 735aec0691..830ae36534 100644
--- a/libavcodec/riscv/flacdsp_init.c
+++ b/libavcodec/riscv/flacdsp_init.c
@@ -71,17 +71,18 @@ av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
int vlenb = ff_get_rv_vlenb();
- if ((flags & AV_CPU_FLAG_RVB_BASIC) && vlenb >= 16)
+ if ((flags & AV_CPU_FLAG_RVB_BASIC) && vlenb >= 16) {
c->lpc16 = ff_flac_lpc16_rvv;
# if (__riscv_xlen >= 64)
- if (flags & AV_CPU_FLAG_RVV_I64) {
- if (vlenb > 16)
- c->lpc32 = ff_flac_lpc32_rvv_simple;
- else
- c->lpc32 = ff_flac_lpc32_rvv;
- }
+ if (flags & AV_CPU_FLAG_RVV_I64) {
+ if (vlenb > 16)
+ c->lpc32 = ff_flac_lpc32_rvv_simple;
+ else
+ c->lpc32 = ff_flac_lpc32_rvv;
+ }
# endif
+ }
c->wasted32 = ff_flac_wasted32_rvv;
diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
index e1a20ce8e1..2941928465 100644
--- a/libavcodec/riscv/flacdsp_rvv.S
+++ b/libavcodec/riscv/flacdsp_rvv.S
@@ -76,22 +76,24 @@ func ff_flac_lpc32_rvv, zve64x
ret
endfunc
-func ff_flac_lpc32_rvv_simple, zve64x
- vsetivli zero, 1, e64, m1, ta, ma
+func ff_flac_lpc32_rvv_simple, zve64x, zbb
+ vtype_vli t3, a2, t1, e64, ta, ma
+ vntypei t2, t3
+ vsetvl zero, a2, t3 // e64
vmv.s.x v0, zero
- vsetvli zero, a2, e32, m4, ta, ma
+ vsetvl zero, zero, t2 // e32
vle32.v v8, (a1)
sub a4, a4, a2
vle32.v v16, (a0)
sh2add a0, a2, a0
1:
vwmul.vv v24, v8, v16
- vsetvli zero, zero, e64, m8, ta, ma
+ vsetvl zero, zero, t3 // e64
vredsum.vs v24, v24, v0
lw t0, (a0)
addi a4, a4, -1
vmv.x.s t1, v24
- vsetvli zero, zero, e32, m4, ta, ma
+ vsetvl zero, zero, t2 // e32
sra t1, t1, a3
add t0, t0, t1
vslide1down.vx v16, v16, t0
--
2.43.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32
2024-05-16 16:48 [FFmpeg-devel] [PATCHv4 1/4] lavu/riscv: add assembler macros for adjusting vector LMUL Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 2/4] lavc/flacdsp: optimise RVV vector type for lpc16 Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 3/4] lavc/flacdsp: optimise RVV vector type for lpc32 Rémi Denis-Courmont
@ 2024-05-16 16:48 ` Rémi Denis-Courmont
2024-05-16 17:14 ` Rémi Denis-Courmont
2 siblings, 1 reply; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-16 16:48 UTC (permalink / raw)
To: ffmpeg-devel
T-Head C908:
add_hfyu_left_pred_bgr32_c: 237.5
add_hfyu_left_pred_bgr32_rvv_i32: 173.5 (before)
add_hfyu_left_pred_bgr32_rvv_i32: 110.0 (after)
---
libavcodec/riscv/huffyuvdsp_rvv.S | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/libavcodec/riscv/huffyuvdsp_rvv.S b/libavcodec/riscv/huffyuvdsp_rvv.S
index 9c4434907d..d334f5c6d0 100644
--- a/libavcodec/riscv/huffyuvdsp_rvv.S
+++ b/libavcodec/riscv/huffyuvdsp_rvv.S
@@ -36,8 +36,10 @@ func ff_add_int16_rvv, zve32x
ret
endfunc
-func ff_add_hfyu_left_pred_bgr32_rvv, zve32x
- vsetivli zero, 4, e8, m1, ta, ma
+func ff_add_hfyu_left_pred_bgr32_rvv, zve32x, zbb
+ vtype_ivli t1, 4, e8, ta, ma
+ li t0, 4
+ vsetvl zero, t0, t1
vle8.v v8, (a3)
sh2add a2, a2, a1
1:
--
2.43.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32 Rémi Denis-Courmont
@ 2024-05-16 17:14 ` Rémi Denis-Courmont
0 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-16 17:14 UTC (permalink / raw)
To: ffmpeg-devel
Le torstaina 16. toukokuuta 2024, 19.48.40 EEST Rémi Denis-Courmont a écrit :
> T-Head C908:
> add_hfyu_left_pred_bgr32_c: 237.5
> add_hfyu_left_pred_bgr32_rvv_i32: 173.5 (before)
> add_hfyu_left_pred_bgr32_rvv_i32: 110.0 (after)
> ---
> libavcodec/riscv/huffyuvdsp_rvv.S | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/libavcodec/riscv/huffyuvdsp_rvv.S
> b/libavcodec/riscv/huffyuvdsp_rvv.S index 9c4434907d..d334f5c6d0 100644
> --- a/libavcodec/riscv/huffyuvdsp_rvv.S
> +++ b/libavcodec/riscv/huffyuvdsp_rvv.S
> @@ -36,8 +36,10 @@ func ff_add_int16_rvv, zve32x
> ret
> endfunc
>
> -func ff_add_hfyu_left_pred_bgr32_rvv, zve32x
> - vsetivli zero, 4, e8, m1, ta, ma
> +func ff_add_hfyu_left_pred_bgr32_rvv, zve32x, zbb
> + vtype_ivli t1, 4, e8, ta, ma
> + li t0, 4
> + vsetvl zero, t0, t1
> vle8.v v8, (a3)
> sh2add a2, a2, a1
> 1:
Missing Zbb check in init.c will be added.
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-05-16 17:14 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-16 16:48 [FFmpeg-devel] [PATCHv4 1/4] lavu/riscv: add assembler macros for adjusting vector LMUL Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 2/4] lavc/flacdsp: optimise RVV vector type for lpc16 Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 3/4] lavc/flacdsp: optimise RVV vector type for lpc32 Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32 Rémi Denis-Courmont
2024-05-16 17:14 ` Rémi Denis-Courmont
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git