* [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code
@ 2024-07-18 19:35 Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 2/5] lavc/h264dsp: move R-V V idct_dc_add Rémi Denis-Courmont
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-18 19:35 UTC (permalink / raw)
To: ffmpeg-devel
---
libavcodec/riscv/h264idct_rvv.S | 108 ++++++--------------------------
1 file changed, 18 insertions(+), 90 deletions(-)
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 48de65ec0b..505f491308 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -418,8 +418,8 @@ const ff_h264_scan8
endconst
#if (__riscv_xlen == 64)
-.irp depth, 8, 16
-func ff_h264_idct_add16_\depth\()_rvv, zve32x
+.macro idct4_adds type, depth
+func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
csrwi vxrm, 0
addi sp, sp, -96
lla t0, ff_h264_scan8
@@ -455,9 +455,13 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
.endif
vmsne.vi v1, v16, 0
vsetvli zero, zero, e8, m1, ta, ma
+.ifc \type, 16
vmseq.vi v2, v12, 1
+.endif
vmsne.vi v0, v12, 0
+.ifc \type, 16
vmand.mm v1, v1, v2
+.endif
vsetvli zero, zero, e16, m2, ta, ma
vmv.x.s s2, v0
vmv.x.s s3, v1
@@ -470,96 +474,9 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
andi t0, s2, 1
addi s1, s1, -1
srli s2, s2, 1
+.ifc \type, 16
beqz t0, 3f # if (nnz)
- lw t2, (s5) # block_offset[i]
- andi t1, s3, 1
- mv a1, s6
- mv a2, s7
- add a0, s4, t2
-.if \depth > 8
- mv a5, s8
.endif
- bnez t1, 2f # if (nnz == 1 && block[i * 16])
- jal .Lidct_add4_\depth\()_rvv
- j 3f
-2:
-.if \depth == 8
- call ff_h264_idct_dc_add_\depth\()_c
-.else
- jalr s9
-.endif
-3:
- srli s3, s3, 1
- addi s5, s5, 4
- addi s6, s6, 16 * 2 * (\depth / 8)
- bnez s1, 1b
-
-.if \depth > 8
- ld s9, 80(sp)
- ld s8, 72(sp)
-.endif
- ld s7, 64(sp)
- ld s6, 56(sp)
- ld s5, 48(sp)
- ld s4, 40(sp)
- ld s3, 32(sp)
- ld s2, 24(sp)
- ld s1, 16(sp)
- ld ra, 8(sp)
- ld s0, 0(sp)
- addi sp, sp, 96
- ret
-endfunc
-
-func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
- csrwi vxrm, 0
- addi sp, sp, -96
- lla t0, ff_h264_scan8
- sd s0, (sp)
- li t1, 32 * (\depth / 8)
- mv s0, sp
- sd ra, 8(sp)
- sd s1, 16(sp)
- sd s2, 24(sp)
- sd s3, 32(sp)
- sd s4, 40(sp)
- sd s5, 48(sp)
- sd s6, 56(sp)
- sd s7, 64(sp)
-.if \depth > 8
- sd s8, 72(sp)
- sd s9, 80(sp)
- mv s8, a5
- mv s9, a6
-.endif
- vsetivli zero, 16, e8, m1, ta, ma
- vle8.v v8, (t0)
-.if \depth == 8
- vlse16.v v16, (a2), t1
-.else
- vlse32.v v16, (a2), t1
-.endif
- vluxei8.v v12, (a4), v8
-.if \depth == 8
- vsetvli zero, zero, e16, m2, ta, ma
-.else
- vsetvli zero, zero, e32, m4, ta, ma
-.endif
- vmsne.vi v1, v16, 0
- vsetvli zero, zero, e8, m1, ta, ma
- vmsne.vi v0, v12, 0
- vsetvli zero, zero, e16, m2, ta, ma
- vmv.x.s s2, v0
- vmv.x.s s3, v1
- li s1, 16
- mv s4, a0
- mv s5, a1
- mv s6, a2
- mv s7, a3
-1:
- andi t0, s2, 1
- addi s1, s1, -1
- srli s2, s2, 1
lw t2, (s5) # block_offset[i]
andi t1, s3, 1
mv a1, s6
@@ -568,11 +485,17 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
.if \depth > 8
mv a5, s8
.endif
+.ifc \type, 16
+ bnez t1, 2f # if (nnz == 1 && block[i * 16])
+.else
beqz t0, 2f # if (nnzc[scan8[i]])
+.endif
jal .Lidct_add4_\depth\()_rvv
j 3f
2:
+.ifnc \type, 16
beqz t1, 3f # if (block[i * 16])
+.endif
.if \depth == 8
call ff_h264_idct_dc_add_\depth\()_c
.else
@@ -600,6 +523,11 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
addi sp, sp, 96
ret
endfunc
+.endm
+
+.irp depth, 8, 16
+idct4_adds 16, \depth
+idct4_adds 16intra, \depth
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
csrwi vxrm, 0
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 2/5] lavc/h264dsp: move R-V V idct_dc_add
2024-07-18 19:35 [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code Rémi Denis-Courmont
@ 2024-07-18 19:35 ` Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 3/5] lavc/h264dsp: correct VL and LMUL in idct_dc_add Rémi Denis-Courmont
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-18 19:35 UTC (permalink / raw)
To: ffmpeg-devel
From: "J. Dekker" <jdek@itanimul.li>
No functional changes. This just moves the assembly code so that it can be
referenced by other functions in h264idct_rvv.S with local jumps.
Edited-by: Rémi Denis-Courmont <remi@remlab.net>
---
libavcodec/riscv/h264dsp_rvv.S | 103 -------------------------------
libavcodec/riscv/h264idct_rvv.S | 105 ++++++++++++++++++++++++++++++++
2 files changed, 105 insertions(+), 103 deletions(-)
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index 5c70709cf2..ed6a16a9c4 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -1,7 +1,6 @@
/*
* SPDX-License-Identifier: BSD-2-Clause
*
- * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
* Copyright © 2024 Rémi Denis-Courmont.
*
* Redistribution and use in source and binary forms, with or without
@@ -326,105 +325,3 @@ func ff_h264_h_loop_filter_luma_mbaff_8_rvv, zve32x
vssseg6e8.v v8, (a0), a1
ret
endfunc
-
-.macro idct_dc_add8 width
-func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
-.if \width == 8
- vsetivli zero, \width, e16, m1, ta, ma
-.else
- vsetivli zero, \width, e16, mf2, ta, ma
-.endif
- lh a3, 0(a1)
- addi a3, a3, 32
- srai a3, a3, 6
- sh zero, 0(a1)
-.if \width == 8
- vlse64.v v24, (a0), a2
- vsetvli t0, zero, e16, m8, ta, ma
-.else
- vlse32.v v24, (a0), a2
- vsetvli t0, zero, e16, m4, ta, ma
-.endif
- vzext.vf2 v0, v24
- vadd.vx v0, v0, a3
- vmax.vx v0, v0, zero
-.if \width == 8
- vsetvli zero, zero, e8, m4, ta, ma
-.else
- vsetvli zero, zero, e8, m2, ta, ma
-.endif
- vnclipu.wi v24, v0, 0
- vsetivli zero, \width, e8, m1, ta, ma
-.if \width == 8
- vsse64.v v24, (a0), a2
-.else
- vsse32.v v24, (a0), a2
-.endif
- ret
-endfunc
-.endm
-
-idct_dc_add8 4
-idct_dc_add8 8
-
-.macro idct_dc_add width
-func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
- vsetivli zero, \width, e16, m1, ta, ma
- lw a3, 0(a1)
- addi a3, a3, 32
- srai a3, a3, 6
- sw zero, 0(a1)
- add t4, a0, a2
- sh1add t5, a2, a0
- sh1add t6, a2, t4
-.if \width == 8
- sh2add t0, a2, a0
- sh2add t1, a2, t4
- sh2add t2, a2, t5
- sh2add t3, a2, t6
-.endif
- vle16.v v0, (a0)
- vle16.v v1, (t4)
- vle16.v v2, (t5)
- vle16.v v3, (t6)
-.if \width == 8
- vle16.v v4, (t0)
- vle16.v v5, (t1)
- vle16.v v6, (t2)
- vle16.v v7, (t3)
- vsetvli a6, zero, e16, m8, ta, ma
-.else
- vsetvli a6, zero, e16, m4, ta, ma
-.endif
- vadd.vx v0, v0, a3
- vmax.vx v0, v0, zero
- vmin.vx v0, v0, a5
- vsetivli zero, \width, e16, m1, ta, ma
- vse16.v v0, (a0)
- vse16.v v1, (t4)
- vse16.v v2, (t5)
- vse16.v v3, (t6)
-.if \width == 8
- vse16.v v4, (t0)
- vse16.v v5, (t1)
- vse16.v v6, (t2)
- vse16.v v7, (t3)
-.endif
- ret
-endfunc
-.endm
-
-idct_dc_add 4
-idct_dc_add 8
-
-.irp depth,9,10,12,14
-func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
- li a5, (1 << \depth) - 1
- j ff_h264_idct4_dc_add_16_rvv
-endfunc
-
-func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
- li a5, (1 << \depth) - 1
- j ff_h264_idct8_dc_add_16_rvv
-endfunc
-.endr
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 505f491308..37b27fc92a 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -1,4 +1,7 @@
/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
* Copyright © 2024 Rémi Denis-Courmont.
*
* Redistribution and use in source and binary forms, with or without
@@ -412,6 +415,108 @@ func ff_h264_idct8_add_\depth\()_rvv, zve32x
endfunc
.endr
+.macro idct_dc_add8 width
+func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
+.if \width == 8
+ vsetivli zero, \width, e16, m1, ta, ma
+.else
+ vsetivli zero, \width, e16, mf2, ta, ma
+.endif
+ lh a3, 0(a1)
+ addi a3, a3, 32
+ srai a3, a3, 6
+ sh zero, 0(a1)
+.if \width == 8
+ vlse64.v v24, (a0), a2
+ vsetvli t0, zero, e16, m8, ta, ma
+.else
+ vlse32.v v24, (a0), a2
+ vsetvli t0, zero, e16, m4, ta, ma
+.endif
+ vzext.vf2 v0, v24
+ vadd.vx v0, v0, a3
+ vmax.vx v0, v0, zero
+.if \width == 8
+ vsetvli zero, zero, e8, m4, ta, ma
+.else
+ vsetvli zero, zero, e8, m2, ta, ma
+.endif
+ vnclipu.wi v24, v0, 0
+ vsetivli zero, \width, e8, m1, ta, ma
+.if \width == 8
+ vsse64.v v24, (a0), a2
+.else
+ vsse32.v v24, (a0), a2
+.endif
+ ret
+endfunc
+.endm
+
+idct_dc_add8 4
+idct_dc_add8 8
+
+.macro idct_dc_add width
+func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
+ vsetivli zero, \width, e16, m1, ta, ma
+ lw a3, 0(a1)
+ addi a3, a3, 32
+ srai a3, a3, 6
+ sw zero, 0(a1)
+ add t4, a0, a2
+ sh1add t5, a2, a0
+ sh1add t6, a2, t4
+.if \width == 8
+ sh2add t0, a2, a0
+ sh2add t1, a2, t4
+ sh2add t2, a2, t5
+ sh2add t3, a2, t6
+.endif
+ vle16.v v0, (a0)
+ vle16.v v1, (t4)
+ vle16.v v2, (t5)
+ vle16.v v3, (t6)
+.if \width == 8
+ vle16.v v4, (t0)
+ vle16.v v5, (t1)
+ vle16.v v6, (t2)
+ vle16.v v7, (t3)
+ vsetvli a6, zero, e16, m8, ta, ma
+.else
+ vsetvli a6, zero, e16, m4, ta, ma
+.endif
+ vadd.vx v0, v0, a3
+ vmax.vx v0, v0, zero
+ vmin.vx v0, v0, a5
+ vsetivli zero, \width, e16, m1, ta, ma
+ vse16.v v0, (a0)
+ vse16.v v1, (t4)
+ vse16.v v2, (t5)
+ vse16.v v3, (t6)
+.if \width == 8
+ vse16.v v4, (t0)
+ vse16.v v5, (t1)
+ vse16.v v6, (t2)
+ vse16.v v7, (t3)
+.endif
+ ret
+endfunc
+.endm
+
+idct_dc_add 4
+idct_dc_add 8
+
+.irp depth,9,10,12,14
+func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
+ li a5, (1 << \depth) - 1
+ j ff_h264_idct4_dc_add_16_rvv
+endfunc
+
+func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
+ li a5, (1 << \depth) - 1
+ j ff_h264_idct8_dc_add_16_rvv
+endfunc
+.endr
+
const ff_h264_scan8
.byte 014, 015, 024, 025, 016, 017, 026, 027
.byte 034, 035, 044, 045, 036, 037, 046, 047
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 3/5] lavc/h264dsp: correct VL and LMUL in idct_dc_add
2024-07-18 19:35 [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 2/5] lavc/h264dsp: move R-V V idct_dc_add Rémi Denis-Courmont
@ 2024-07-18 19:35 ` Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 4/5] lavc/h264dsp: reuse the R-V V IDCT DC add functions Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 5/5] lavc/h264dsp: reduce spills in R-V V idct_add16 Rémi Denis-Courmont
3 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-18 19:35 UTC (permalink / raw)
To: ffmpeg-devel
T-Head C908 (cycles):
h264_idct4_dc_add_8bpp_c: 94.7
h264_idct4_dc_add_8bpp_rvv_i32: 55.0 (before)
h264_idct4_dc_add_8bpp_rvv_i32: 34.5 (after)
h264_idct4_dc_add_9bpp_c: 94.7
h264_idct4_dc_add_9bpp_rvv_i32: 43.5 (before)
h264_idct4_dc_add_9bpp_rvv_i32: 38.2 (after)
h264_idct4_dc_add_10bpp_c: 94.7
h264_idct4_dc_add_10bpp_rvv_i32: 43.5 (before)
h264_idct4_dc_add_10bpp_rvv_i32: 38.2 (after)
h264_idct4_dc_add_12bpp_c: 94.7
h264_idct4_dc_add_12bpp_rvv_i32: 43.7 (before)
h264_idct4_dc_add_12bpp_rvv_i32: 38.5 (after)
h264_idct4_dc_add_14bpp_c: 94.7
h264_idct4_dc_add_14bpp_rvv_i32: 43.7 (before)
h264_idct4_dc_add_14bpp_rvv_i32: 38.5 (after)
---
libavcodec/riscv/h264idct_rvv.S | 24 +++++++++++++++++-------
1 file changed, 17 insertions(+), 7 deletions(-)
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 37b27fc92a..2648e06aeb 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -416,22 +416,23 @@ endfunc
.endr
.macro idct_dc_add8 width
-func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
+func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
.if \width == 8
- vsetivli zero, \width, e16, m1, ta, ma
+ vsetivli zero, \width, e8, mf2, ta, ma
.else
- vsetivli zero, \width, e16, mf2, ta, ma
+ vsetivli zero, \width, e8, mf4, ta, ma
.endif
lh a3, 0(a1)
addi a3, a3, 32
srai a3, a3, 6
sh zero, 0(a1)
.if \width == 8
+ li a6, \width * \width
vlse64.v v24, (a0), a2
- vsetvli t0, zero, e16, m8, ta, ma
+ vsetvli zero, a6, e16, m8, ta, ma
.else
vlse32.v v24, (a0), a2
- vsetvli t0, zero, e16, m4, ta, ma
+ vsetivli zero, \width * \width, e16, m2, ta, ma
.endif
vzext.vf2 v0, v24
vadd.vx v0, v0, a3
@@ -439,13 +440,14 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
.if \width == 8
vsetvli zero, zero, e8, m4, ta, ma
.else
- vsetvli zero, zero, e8, m2, ta, ma
+ vsetvli zero, zero, e8, m1, ta, ma
.endif
vnclipu.wi v24, v0, 0
- vsetivli zero, \width, e8, m1, ta, ma
.if \width == 8
+ vsetivli zero, \width, e8, mf2, ta, ma
vsse64.v v24, (a0), a2
.else
+ vsetivli zero, \width, e8, mf4, ta, ma
vsse32.v v24, (a0), a2
.endif
ret
@@ -457,7 +459,11 @@ idct_dc_add8 8
.macro idct_dc_add width
func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
+.if \width == 8
vsetivli zero, \width, e16, m1, ta, ma
+.else
+ vsetivli zero, \width, e16, mf2, ta, ma
+.endif
lw a3, 0(a1)
addi a3, a3, 32
srai a3, a3, 6
@@ -487,7 +493,11 @@ func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
vadd.vx v0, v0, a3
vmax.vx v0, v0, zero
vmin.vx v0, v0, a5
+.if \width == 8
vsetivli zero, \width, e16, m1, ta, ma
+.else
+ vsetivli zero, \width, e16, mf2, ta, ma
+.endif
vse16.v v0, (a0)
vse16.v v1, (t4)
vse16.v v2, (t5)
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 4/5] lavc/h264dsp: reuse the R-V V IDCT DC add functions
2024-07-18 19:35 [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 2/5] lavc/h264dsp: move R-V V idct_dc_add Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 3/5] lavc/h264dsp: correct VL and LMUL in idct_dc_add Rémi Denis-Courmont
@ 2024-07-18 19:35 ` Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 5/5] lavc/h264dsp: reduce spills in R-V V idct_add16 Rémi Denis-Courmont
3 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-18 19:35 UTC (permalink / raw)
To: ffmpeg-devel
This makes the multi-block IDCT functions call the R-V V DC-only add functions
instead of the C ones, so that the DC bypass path also leverages vector code.
As an added bonus, the caller functions can now rely on the callee functions
to preserve their parameters, thus cutting down on stack spills.
---
libavcodec/riscv/h264idct_rvv.S | 76 +++++++--------------------------
1 file changed, 16 insertions(+), 60 deletions(-)
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 2648e06aeb..c42db6ef29 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -536,7 +536,7 @@ endconst
.macro idct4_adds type, depth
func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
csrwi vxrm, 0
- addi sp, sp, -96
+ addi sp, sp, -64
lla t0, ff_h264_scan8
sd s0, (sp)
li t1, 32 * (\depth / 8)
@@ -547,14 +547,6 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
sd s3, 32(sp)
sd s4, 40(sp)
sd s5, 48(sp)
- sd s6, 56(sp)
- sd s7, 64(sp)
-.if \depth > 8
- sd s8, 72(sp)
- sd s9, 80(sp)
- mv s8, a5
- mv s9, a6
-.endif
vsetivli zero, 16, e8, m1, ta, ma
vle8.v v8, (t0)
.if \depth == 8
@@ -583,8 +575,8 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
li s1, 16
mv s4, a0
mv s5, a1
- mv s6, a2
- mv s7, a3
+ mv a1, a2
+ mv a2, a3
1:
andi t0, s2, 1
addi s1, s1, -1
@@ -594,12 +586,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
.endif
lw t2, (s5) # block_offset[i]
andi t1, s3, 1
- mv a1, s6
- mv a2, s7
add a0, s4, t2
-.if \depth > 8
- mv a5, s8
-.endif
.ifc \type, 16
bnez t1, 2f # if (nnz == 1 && block[i * 16])
.else
@@ -611,23 +598,13 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
.ifnc \type, 16
beqz t1, 3f # if (block[i * 16])
.endif
-.if \depth == 8
- call ff_h264_idct_dc_add_\depth\()_c
-.else
- jalr s9
-.endif
+ jal ff_h264_idct4_dc_add_\depth\()_rvv
3:
srli s3, s3, 1
addi s5, s5, 4
- addi s6, s6, 16 * 2 * (\depth / 8)
+ addi a1, a1, 16 * 2 * (\depth / 8)
bnez s1, 1b
-.if \depth > 8
- ld s9, 80(sp)
- ld s8, 72(sp)
-.endif
- ld s7, 64(sp)
- ld s6, 56(sp)
ld s5, 48(sp)
ld s4, 40(sp)
ld s3, 32(sp)
@@ -635,7 +612,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
ld s1, 16(sp)
ld ra, 8(sp)
ld s0, 0(sp)
- addi sp, sp, 96
+ addi sp, sp, 64
ret
endfunc
.endm
@@ -646,7 +623,7 @@ idct4_adds 16intra, \depth
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
csrwi vxrm, 0
- addi sp, sp, -96
+ addi sp, sp, -64
lla t0, ff_h264_scan8
sd s0, (sp)
li t1, 4 * 32 * (\depth / 8)
@@ -658,14 +635,6 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
sd s3, 32(sp)
sd s4, 40(sp)
sd s5, 48(sp)
- sd s6, 56(sp)
- sd s7, 64(sp)
-.if \depth > 8
- sd s8, 72(sp)
- sd s9, 80(sp)
- mv s8, a5
- mv s9, a6
-.endif
vsetivli zero, 4, e8, mf4, ta, ma
vlse8.v v8, (t0), t2
.if \depth == 8
@@ -689,8 +658,8 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
li s1, 4
mv s4, a0
mv s5, a1
- mv s6, a2
- mv s7, a3
+ mv a1, a2
+ mv a2, a3
1:
andi t0, s2, 1
addi s1, s1, -1
@@ -698,33 +667,23 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
beqz t0, 3f # if (nnz)
lw t2, (s5) # block_offset[i]
andi t1, s3, 1
- mv a1, s6
- mv a2, s7
add a0, s4, t2
-.if \depth > 8
- mv a5, s8
-.endif
bnez t1, 2f # if (nnz == 1 && block[i * 16])
jal .Lidct8_add_\depth\()_rvv
- j 3f
-2:
.if \depth == 8
- call ff_h264_idct8_dc_add_\depth\()_c
+ j 3f
.else
- jalr s9
+ j 4f # idct8_add_16 updates a1
.endif
+2:
+ jal ff_h264_idct8_dc_add_\depth\()_rvv
3:
+ addi a1, a1, 4 * 16 * 2 * (\depth / 8)
+4:
srli s3, s3, 1
addi s5, s5, 4 * 4
- addi s6, s6, 4 * 16 * 2 * (\depth / 8)
bnez s1, 1b
-.if \depth > 8
- ld s9, 80(sp)
- ld s8, 72(sp)
-.endif
- ld s7, 64(sp)
- ld s6, 56(sp)
ld s5, 48(sp)
ld s4, 40(sp)
ld s3, 32(sp)
@@ -732,7 +691,7 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
ld s1, 16(sp)
ld ra, 8(sp)
ld s0, 0(sp)
- addi sp, sp, 96
+ addi sp, sp, 64
ret
endfunc
.endr
@@ -740,19 +699,16 @@ endfunc
.irp depth, 9, 10, 12, 14
func ff_h264_idct_add16_\depth\()_rvv, zve32x
li a5, (1 << \depth) - 1
- lla a6, ff_h264_idct_dc_add_\depth\()_c
j ff_h264_idct_add16_16_rvv
endfunc
func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
li a5, (1 << \depth) - 1
- lla a6, ff_h264_idct_dc_add_\depth\()_c
j ff_h264_idct_add16intra_16_rvv
endfunc
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
li a5, (1 << \depth) - 1
- lla a6, ff_h264_idct8_dc_add_\depth\()_c
j ff_h264_idct8_add4_16_rvv
endfunc
.endr
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 5/5] lavc/h264dsp: reduce spills in R-V V idct_add16
2024-07-18 19:35 [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code Rémi Denis-Courmont
` (2 preceding siblings ...)
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 4/5] lavc/h264dsp: reuse the R-V V IDCT DC add functions Rémi Denis-Courmont
@ 2024-07-18 19:35 ` Rémi Denis-Courmont
3 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-18 19:35 UTC (permalink / raw)
To: ffmpeg-devel
---
libavcodec/riscv/h264idct_rvv.S | 100 ++++++++++++++------------------
1 file changed, 45 insertions(+), 55 deletions(-)
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index c42db6ef29..c74ea18c19 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -422,9 +422,9 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
.else
vsetivli zero, \width, e8, mf4, ta, ma
.endif
- lh a3, 0(a1)
- addi a3, a3, 32
- srai a3, a3, 6
+ lh t0, 0(a1)
+ addi t0, t0, 32
+ srai t0, t0, 6
sh zero, 0(a1)
.if \width == 8
li a6, \width * \width
@@ -435,7 +435,7 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
vsetivli zero, \width * \width, e16, m2, ta, ma
.endif
vzext.vf2 v0, v24
- vadd.vx v0, v0, a3
+ vadd.vx v0, v0, t0
vmax.vx v0, v0, zero
.if \width == 8
vsetvli zero, zero, e8, m4, ta, ma
@@ -464,33 +464,33 @@ func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
.else
vsetivli zero, \width, e16, mf2, ta, ma
.endif
- lw a3, 0(a1)
- addi a3, a3, 32
- srai a3, a3, 6
+ lw t0, 0(a1)
+ addi t0, t0, 32
+ srai t0, t0, 6
sw zero, 0(a1)
- add t4, a0, a2
- sh1add t5, a2, a0
- sh1add t6, a2, t4
+ add t1, a0, a2
+ sh1add t2, a2, a0
+ sh1add t3, a2, t1
.if \width == 8
- sh2add t0, a2, a0
- sh2add t1, a2, t4
- sh2add t2, a2, t5
- sh2add t3, a2, t6
+ sh2add t4, a2, a0
+ sh2add t5, a2, t1
+ sh2add t6, a2, t2
+ sh2add a7, a2, t3
.endif
vle16.v v0, (a0)
- vle16.v v1, (t4)
- vle16.v v2, (t5)
- vle16.v v3, (t6)
+ vle16.v v1, (t1)
+ vle16.v v2, (t2)
+ vle16.v v3, (t3)
.if \width == 8
- vle16.v v4, (t0)
- vle16.v v5, (t1)
- vle16.v v6, (t2)
- vle16.v v7, (t3)
+ vle16.v v4, (t4)
+ vle16.v v5, (t5)
+ vle16.v v6, (t6)
+ vle16.v v7, (a7)
vsetvli a6, zero, e16, m8, ta, ma
.else
vsetvli a6, zero, e16, m4, ta, ma
.endif
- vadd.vx v0, v0, a3
+ vadd.vx v0, v0, t0
vmax.vx v0, v0, zero
vmin.vx v0, v0, a5
.if \width == 8
@@ -499,14 +499,14 @@ func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
vsetivli zero, \width, e16, mf2, ta, ma
.endif
vse16.v v0, (a0)
- vse16.v v1, (t4)
- vse16.v v2, (t5)
- vse16.v v3, (t6)
+ vse16.v v1, (t1)
+ vse16.v v2, (t2)
+ vse16.v v3, (t3)
.if \width == 8
- vse16.v v4, (t0)
- vse16.v v5, (t1)
- vse16.v v6, (t2)
- vse16.v v7, (t3)
+ vse16.v v4, (t4)
+ vse16.v v5, (t5)
+ vse16.v v6, (t6)
+ vse16.v v7, (a7)
.endif
ret
endfunc
@@ -536,17 +536,12 @@ endconst
.macro idct4_adds type, depth
func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
csrwi vxrm, 0
- addi sp, sp, -64
+ addi sp, sp, -16
lla t0, ff_h264_scan8
sd s0, (sp)
li t1, 32 * (\depth / 8)
mv s0, sp
sd ra, 8(sp)
- sd s1, 16(sp)
- sd s2, 24(sp)
- sd s3, 32(sp)
- sd s4, 40(sp)
- sd s5, 48(sp)
vsetivli zero, 16, e8, m1, ta, ma
vle8.v v8, (t0)
.if \depth == 8
@@ -570,23 +565,23 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
vmand.mm v1, v1, v2
.endif
vsetvli zero, zero, e16, m2, ta, ma
- vmv.x.s s2, v0
- vmv.x.s s3, v1
- li s1, 16
- mv s4, a0
- mv s5, a1
+ vmv.x.s a4, v0
+ vmv.x.s a7, v1
+ mv t4, a0
+ mv t5, a1
mv a1, a2
mv a2, a3
+ li a3, 16
1:
- andi t0, s2, 1
- addi s1, s1, -1
- srli s2, s2, 1
+ andi t0, a4, 1
+ addi a3, a3, -1
+ srli a4, a4, 1
.ifc \type, 16
beqz t0, 3f # if (nnz)
.endif
- lw t2, (s5) # block_offset[i]
- andi t1, s3, 1
- add a0, s4, t2
+ lw t2, (t5) # block_offset[i]
+ andi t1, a7, 1
+ add a0, t4, t2
.ifc \type, 16
bnez t1, 2f # if (nnz == 1 && block[i * 16])
.else
@@ -600,19 +595,14 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
.endif
jal ff_h264_idct4_dc_add_\depth\()_rvv
3:
- srli s3, s3, 1
- addi s5, s5, 4
+ srli a7, a7, 1
+ addi t5, t5, 4
addi a1, a1, 16 * 2 * (\depth / 8)
- bnez s1, 1b
+ bnez a3, 1b
- ld s5, 48(sp)
- ld s4, 40(sp)
- ld s3, 32(sp)
- ld s2, 24(sp)
- ld s1, 16(sp)
ld ra, 8(sp)
ld s0, 0(sp)
- addi sp, sp, 64
+ addi sp, sp, 16
ret
endfunc
.endm
--
2.45.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-07-18 19:36 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-07-18 19:35 [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 2/5] lavc/h264dsp: move R-V V idct_dc_add Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 3/5] lavc/h264dsp: correct VL and LMUL in idct_dc_add Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 4/5] lavc/h264dsp: reuse the R-V V IDCT DC add functions Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 5/5] lavc/h264dsp: reduce spills in R-V V idct_add16 Rémi Denis-Courmont
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git