Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code
@ 2024-07-18 19:35 Rémi Denis-Courmont
  2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 2/5] lavc/h264dsp: move R-V V idct_dc_add Rémi Denis-Courmont
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-18 19:35 UTC (permalink / raw)
  To: ffmpeg-devel

---
 libavcodec/riscv/h264idct_rvv.S | 108 ++++++--------------------------
 1 file changed, 18 insertions(+), 90 deletions(-)

diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 48de65ec0b..505f491308 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -418,8 +418,8 @@ const ff_h264_scan8
 endconst
 
 #if (__riscv_xlen == 64)
-.irp    depth, 8, 16
-func ff_h264_idct_add16_\depth\()_rvv, zve32x
+.macro  idct4_adds type, depth
+func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
         csrwi   vxrm, 0
         addi    sp, sp, -96
         lla     t0, ff_h264_scan8
@@ -455,9 +455,13 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
 .endif
         vmsne.vi  v1, v16, 0
         vsetvli   zero, zero, e8, m1, ta, ma
+.ifc \type, 16
         vmseq.vi  v2, v12, 1
+.endif
         vmsne.vi  v0, v12, 0
+.ifc \type, 16
         vmand.mm  v1, v1, v2
+.endif
         vsetvli   zero, zero, e16, m2, ta, ma
         vmv.x.s   s2, v0
         vmv.x.s   s3, v1
@@ -470,96 +474,9 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
         andi    t0, s2, 1
         addi    s1, s1, -1
         srli    s2, s2, 1
+.ifc \type, 16
         beqz    t0, 3f     # if (nnz)
-        lw      t2, (s5)   # block_offset[i]
-        andi    t1, s3, 1
-        mv      a1, s6
-        mv      a2, s7
-        add     a0, s4, t2
-.if \depth > 8
-        mv      a5, s8
 .endif
-        bnez    t1, 2f    # if (nnz == 1 && block[i * 16])
-        jal     .Lidct_add4_\depth\()_rvv
-        j       3f
-2:
-.if \depth == 8
-        call    ff_h264_idct_dc_add_\depth\()_c
-.else
-        jalr    s9
-.endif
-3:
-        srli    s3, s3, 1
-        addi    s5, s5, 4
-        addi    s6, s6, 16 * 2 * (\depth / 8)
-        bnez    s1, 1b
-
-.if \depth > 8
-        ld      s9, 80(sp)
-        ld      s8, 72(sp)
-.endif
-        ld      s7, 64(sp)
-        ld      s6, 56(sp)
-        ld      s5, 48(sp)
-        ld      s4, 40(sp)
-        ld      s3, 32(sp)
-        ld      s2, 24(sp)
-        ld      s1, 16(sp)
-        ld      ra,  8(sp)
-        ld      s0,  0(sp)
-        addi    sp, sp, 96
-        ret
-endfunc
-
-func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
-        csrwi   vxrm, 0
-        addi    sp, sp, -96
-        lla     t0, ff_h264_scan8
-        sd      s0,   (sp)
-        li      t1, 32 * (\depth / 8)
-        mv      s0, sp
-        sd      ra,  8(sp)
-        sd      s1, 16(sp)
-        sd      s2, 24(sp)
-        sd      s3, 32(sp)
-        sd      s4, 40(sp)
-        sd      s5, 48(sp)
-        sd      s6, 56(sp)
-        sd      s7, 64(sp)
-.if \depth > 8
-        sd      s8, 72(sp)
-        sd      s9, 80(sp)
-        mv      s8, a5
-        mv      s9, a6
-.endif
-        vsetivli  zero, 16, e8, m1, ta, ma
-        vle8.v    v8, (t0)
-.if \depth == 8
-        vlse16.v  v16, (a2), t1
-.else
-        vlse32.v  v16, (a2), t1
-.endif
-        vluxei8.v v12, (a4), v8
-.if \depth == 8
-        vsetvli   zero, zero, e16, m2, ta, ma
-.else
-        vsetvli   zero, zero, e32, m4, ta, ma
-.endif
-        vmsne.vi  v1, v16, 0
-        vsetvli   zero, zero, e8, m1, ta, ma
-        vmsne.vi  v0, v12, 0
-        vsetvli   zero, zero, e16, m2, ta, ma
-        vmv.x.s   s2, v0
-        vmv.x.s   s3, v1
-        li      s1, 16
-        mv      s4, a0
-        mv      s5, a1
-        mv      s6, a2
-        mv      s7, a3
-1:
-        andi    t0, s2, 1
-        addi    s1, s1, -1
-        srli    s2, s2, 1
         lw      t2, (s5)   # block_offset[i]
         andi    t1, s3, 1
         mv      a1, s6
@@ -568,11 +485,17 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
 .if \depth > 8
         mv      a5, s8
 .endif
+.ifc \type, 16
+        bnez    t1, 2f     # if (nnz == 1 && block[i * 16])
+.else
         beqz    t0, 2f     # if (nnzc[scan8[i]])
+.endif
         jal     .Lidct_add4_\depth\()_rvv
         j       3f
 2:
+.ifnc \type, 16
         beqz    t1, 3f    # if (block[i * 16])
+.endif
 .if \depth == 8
         call    ff_h264_idct_dc_add_\depth\()_c
 .else
@@ -600,6 +523,11 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
         addi    sp, sp, 96
         ret
 endfunc
+.endm
+
+.irp    depth, 8, 16
+idct4_adds 16, \depth
+idct4_adds 16intra, \depth
 
 func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         csrwi       vxrm, 0
-- 
2.45.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 2/5] lavc/h264dsp: move R-V V idct_dc_add
  2024-07-18 19:35 [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code Rémi Denis-Courmont
@ 2024-07-18 19:35 ` Rémi Denis-Courmont
  2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 3/5] lavc/h264dsp: correct VL and LMUL in idct_dc_add Rémi Denis-Courmont
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-18 19:35 UTC (permalink / raw)
  To: ffmpeg-devel

From: "J. Dekker" <jdek@itanimul.li>

No functional changes. This just moves the assembly code so that it can
be referenced by other functions in h264idct_rvv.S with local jumps.

Edited-by: Rémi Denis-Courmont <remi@remlab.net>
---
 libavcodec/riscv/h264dsp_rvv.S  | 103 -------------------------------
 libavcodec/riscv/h264idct_rvv.S | 105 ++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 103 deletions(-)

diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index 5c70709cf2..ed6a16a9c4 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -1,7 +1,6 @@
 /*
  * SPDX-License-Identifier: BSD-2-Clause
  *
- * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
  * Copyright © 2024 Rémi Denis-Courmont.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -326,105 +325,3 @@ func ff_h264_h_loop_filter_luma_mbaff_8_rvv, zve32x
         vssseg6e8.v v8, (a0), a1
         ret
 endfunc
-
-.macro idct_dc_add8 width
-func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
-.if \width == 8
-        vsetivli        zero, \width, e16, m1, ta, ma
-.else
-        vsetivli        zero, \width, e16, mf2, ta, ma
-.endif
-        lh              a3, 0(a1)
-        addi            a3, a3, 32
-        srai            a3, a3, 6
-        sh              zero, 0(a1)
-.if \width == 8
-        vlse64.v        v24, (a0), a2
-        vsetvli         t0, zero, e16, m8, ta, ma
-.else
-        vlse32.v        v24, (a0), a2
-        vsetvli         t0, zero, e16, m4, ta, ma
-.endif
-        vzext.vf2       v0, v24
-        vadd.vx         v0, v0, a3
-        vmax.vx         v0, v0, zero
-.if \width == 8
-        vsetvli         zero, zero, e8, m4, ta, ma
-.else
-        vsetvli         zero, zero, e8, m2, ta, ma
-.endif
-        vnclipu.wi      v24, v0, 0
-        vsetivli        zero, \width, e8, m1, ta, ma
-.if \width == 8
-        vsse64.v        v24, (a0), a2
-.else
-        vsse32.v        v24, (a0), a2
-.endif
-        ret
-endfunc
-.endm
-
-idct_dc_add8 4
-idct_dc_add8 8
-
-.macro idct_dc_add width
-func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
-        vsetivli        zero, \width, e16, m1, ta, ma
-        lw              a3, 0(a1)
-        addi            a3, a3, 32
-        srai            a3, a3, 6
-        sw              zero, 0(a1)
-        add             t4, a0, a2
-        sh1add          t5, a2, a0
-        sh1add          t6, a2, t4
-.if \width == 8
-        sh2add          t0, a2, a0
-        sh2add          t1, a2, t4
-        sh2add          t2, a2, t5
-        sh2add          t3, a2, t6
-.endif
-        vle16.v         v0, (a0)
-        vle16.v         v1, (t4)
-        vle16.v         v2, (t5)
-        vle16.v         v3, (t6)
-.if \width == 8
-        vle16.v         v4, (t0)
-        vle16.v         v5, (t1)
-        vle16.v         v6, (t2)
-        vle16.v         v7, (t3)
-        vsetvli         a6, zero, e16, m8, ta, ma
-.else
-        vsetvli         a6, zero, e16, m4, ta, ma
-.endif
-        vadd.vx         v0, v0, a3
-        vmax.vx         v0, v0, zero
-        vmin.vx         v0, v0, a5
-        vsetivli        zero, \width, e16, m1, ta, ma
-        vse16.v         v0, (a0)
-        vse16.v         v1, (t4)
-        vse16.v         v2, (t5)
-        vse16.v         v3, (t6)
-.if \width == 8
-        vse16.v         v4, (t0)
-        vse16.v         v5, (t1)
-        vse16.v         v6, (t2)
-        vse16.v         v7, (t3)
-.endif
-        ret
-endfunc
-.endm
-
-idct_dc_add 4
-idct_dc_add 8
-
-.irp depth,9,10,12,14
-func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
-        li              a5, (1 << \depth) - 1
-        j               ff_h264_idct4_dc_add_16_rvv
-endfunc
-
-func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
-        li              a5, (1 << \depth) - 1
-        j               ff_h264_idct8_dc_add_16_rvv
-endfunc
-.endr
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 505f491308..37b27fc92a 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -1,4 +1,7 @@
 /*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
  * Copyright © 2024 Rémi Denis-Courmont.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -412,6 +415,108 @@ func ff_h264_idct8_add_\depth\()_rvv, zve32x
 endfunc
 .endr
 
+.macro idct_dc_add8 width
+func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
+.if \width == 8
+        vsetivli        zero, \width, e16, m1, ta, ma
+.else
+        vsetivli        zero, \width, e16, mf2, ta, ma
+.endif
+        lh              a3, 0(a1)
+        addi            a3, a3, 32
+        srai            a3, a3, 6
+        sh              zero, 0(a1)
+.if \width == 8
+        vlse64.v        v24, (a0), a2
+        vsetvli         t0, zero, e16, m8, ta, ma
+.else
+        vlse32.v        v24, (a0), a2
+        vsetvli         t0, zero, e16, m4, ta, ma
+.endif
+        vzext.vf2       v0, v24
+        vadd.vx         v0, v0, a3
+        vmax.vx         v0, v0, zero
+.if \width == 8
+        vsetvli         zero, zero, e8, m4, ta, ma
+.else
+        vsetvli         zero, zero, e8, m2, ta, ma
+.endif
+        vnclipu.wi      v24, v0, 0
+        vsetivli        zero, \width, e8, m1, ta, ma
+.if \width == 8
+        vsse64.v        v24, (a0), a2
+.else
+        vsse32.v        v24, (a0), a2
+.endif
+        ret
+endfunc
+.endm
+
+idct_dc_add8 4
+idct_dc_add8 8
+
+.macro idct_dc_add width
+func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
+        vsetivli        zero, \width, e16, m1, ta, ma
+        lw              a3, 0(a1)
+        addi            a3, a3, 32
+        srai            a3, a3, 6
+        sw              zero, 0(a1)
+        add             t4, a0, a2
+        sh1add          t5, a2, a0
+        sh1add          t6, a2, t4
+.if \width == 8
+        sh2add          t0, a2, a0
+        sh2add          t1, a2, t4
+        sh2add          t2, a2, t5
+        sh2add          t3, a2, t6
+.endif
+        vle16.v         v0, (a0)
+        vle16.v         v1, (t4)
+        vle16.v         v2, (t5)
+        vle16.v         v3, (t6)
+.if \width == 8
+        vle16.v         v4, (t0)
+        vle16.v         v5, (t1)
+        vle16.v         v6, (t2)
+        vle16.v         v7, (t3)
+        vsetvli         a6, zero, e16, m8, ta, ma
+.else
+        vsetvli         a6, zero, e16, m4, ta, ma
+.endif
+        vadd.vx         v0, v0, a3
+        vmax.vx         v0, v0, zero
+        vmin.vx         v0, v0, a5
+        vsetivli        zero, \width, e16, m1, ta, ma
+        vse16.v         v0, (a0)
+        vse16.v         v1, (t4)
+        vse16.v         v2, (t5)
+        vse16.v         v3, (t6)
+.if \width == 8
+        vse16.v         v4, (t0)
+        vse16.v         v5, (t1)
+        vse16.v         v6, (t2)
+        vse16.v         v7, (t3)
+.endif
+        ret
+endfunc
+.endm
+
+idct_dc_add 4
+idct_dc_add 8
+
+.irp depth,9,10,12,14
+func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
+        li              a5, (1 << \depth) - 1
+        j               ff_h264_idct4_dc_add_16_rvv
+endfunc
+
+func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
+        li              a5, (1 << \depth) - 1
+        j               ff_h264_idct8_dc_add_16_rvv
+endfunc
+.endr
+
 const ff_h264_scan8
         .byte   014, 015, 024, 025, 016, 017, 026, 027
         .byte   034, 035, 044, 045, 036, 037, 046, 047
-- 
2.45.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 3/5] lavc/h264dsp: correct VL and LMUL in idct_dc_add
  2024-07-18 19:35 [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code Rémi Denis-Courmont
  2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 2/5] lavc/h264dsp: move R-V V idct_dc_add Rémi Denis-Courmont
@ 2024-07-18 19:35 ` Rémi Denis-Courmont
  2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 4/5] lavc/h264dsp: reuse the R-V V IDCT DC add functions Rémi Denis-Courmont
  2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 5/5] lavc/h264dsp: reduce spills in R-V V idct_add16 Rémi Denis-Courmont
  3 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-18 19:35 UTC (permalink / raw)
  To: ffmpeg-devel

T-Head C908 (cycles):
h264_idct4_dc_add_8bpp_c:        94.7
h264_idct4_dc_add_8bpp_rvv_i32:  55.0 (before)
h264_idct4_dc_add_8bpp_rvv_i32:  34.5 (after)
h264_idct4_dc_add_9bpp_c:        94.7
h264_idct4_dc_add_9bpp_rvv_i32:  43.5 (before)
h264_idct4_dc_add_9bpp_rvv_i32:  38.2 (after)
h264_idct4_dc_add_10bpp_c:       94.7
h264_idct4_dc_add_10bpp_rvv_i32: 43.5 (before)
h264_idct4_dc_add_10bpp_rvv_i32: 38.2 (after)
h264_idct4_dc_add_12bpp_c:       94.7
h264_idct4_dc_add_12bpp_rvv_i32: 43.7 (before)
h264_idct4_dc_add_12bpp_rvv_i32: 38.5 (after)
h264_idct4_dc_add_14bpp_c:       94.7
h264_idct4_dc_add_14bpp_rvv_i32: 43.7 (before)
h264_idct4_dc_add_14bpp_rvv_i32: 38.5 (after)
---
 libavcodec/riscv/h264idct_rvv.S | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 37b27fc92a..2648e06aeb 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -416,22 +416,23 @@ endfunc
 .endr
 
 .macro idct_dc_add8 width
-func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
+func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
 .if \width == 8
-        vsetivli        zero, \width, e16, m1, ta, ma
+        vsetivli        zero, \width, e8, mf2, ta, ma
 .else
-        vsetivli        zero, \width, e16, mf2, ta, ma
+        vsetivli        zero, \width, e8, mf4, ta, ma
 .endif
         lh              a3, 0(a1)
         addi            a3, a3, 32
         srai            a3, a3, 6
         sh              zero, 0(a1)
 .if \width == 8
+        li              a6, \width * \width
         vlse64.v        v24, (a0), a2
-        vsetvli         t0, zero, e16, m8, ta, ma
+        vsetvli         zero, a6, e16, m8, ta, ma
 .else
         vlse32.v        v24, (a0), a2
-        vsetvli         t0, zero, e16, m4, ta, ma
+        vsetivli        zero, \width * \width, e16, m2, ta, ma
 .endif
         vzext.vf2       v0, v24
         vadd.vx         v0, v0, a3
@@ -439,13 +440,14 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
 .if \width == 8
         vsetvli         zero, zero, e8, m4, ta, ma
 .else
-        vsetvli         zero, zero, e8, m2, ta, ma
+        vsetvli         zero, zero, e8, m1, ta, ma
 .endif
         vnclipu.wi      v24, v0, 0
-        vsetivli        zero, \width, e8, m1, ta, ma
 .if \width == 8
+        vsetivli        zero, \width, e8, mf2, ta, ma
         vsse64.v        v24, (a0), a2
 .else
+        vsetivli        zero, \width, e8, mf4, ta, ma
         vsse32.v        v24, (a0), a2
 .endif
         ret
@@ -457,7 +459,11 @@ idct_dc_add8 8
 
 .macro idct_dc_add width
 func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
+.if \width == 8
         vsetivli        zero, \width, e16, m1, ta, ma
+.else
+        vsetivli        zero, \width, e16, mf2, ta, ma
+.endif
         lw              a3, 0(a1)
         addi            a3, a3, 32
         srai            a3, a3, 6
@@ -487,7 +493,11 @@ func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
         vadd.vx         v0, v0, a3
         vmax.vx         v0, v0, zero
         vmin.vx         v0, v0, a5
+.if \width == 8
         vsetivli        zero, \width, e16, m1, ta, ma
+.else
+        vsetivli        zero, \width, e16, mf2, ta, ma
+.endif
         vse16.v         v0, (a0)
         vse16.v         v1, (t4)
         vse16.v         v2, (t5)
-- 
2.45.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 4/5] lavc/h264dsp: reuse the R-V V IDCT DC add functions
  2024-07-18 19:35 [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code Rémi Denis-Courmont
  2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 2/5] lavc/h264dsp: move R-V V idct_dc_add Rémi Denis-Courmont
  2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 3/5] lavc/h264dsp: correct VL and LMUL in idct_dc_add Rémi Denis-Courmont
@ 2024-07-18 19:35 ` Rémi Denis-Courmont
  2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 5/5] lavc/h264dsp: reduce spills in R-V V idct_add16 Rémi Denis-Courmont
  3 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-18 19:35 UTC (permalink / raw)
  To: ffmpeg-devel

This makes the multi-block IDCT functions (idct_add16, idct_add16intra and
idct8_add4) reuse the R-V V DC bypass functions instead of calling the C
ones, to leverage vector code.

As an added bonus, the caller functions can now rely on the callee functions
to preserve their parameters, thus cutting down on stack spills.
---
 libavcodec/riscv/h264idct_rvv.S | 76 +++++++--------------------------
 1 file changed, 16 insertions(+), 60 deletions(-)

diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 2648e06aeb..c42db6ef29 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -536,7 +536,7 @@ endconst
 .macro  idct4_adds type, depth
 func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
         csrwi   vxrm, 0
-        addi    sp, sp, -96
+        addi    sp, sp, -64
         lla     t0, ff_h264_scan8
         sd      s0,   (sp)
         li      t1, 32 * (\depth / 8)
@@ -547,14 +547,6 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
         sd      s3, 32(sp)
         sd      s4, 40(sp)
         sd      s5, 48(sp)
-        sd      s6, 56(sp)
-        sd      s7, 64(sp)
-.if \depth > 8
-        sd      s8, 72(sp)
-        sd      s9, 80(sp)
-        mv      s8, a5
-        mv      s9, a6
-.endif
         vsetivli  zero, 16, e8, m1, ta, ma
         vle8.v    v8, (t0)
 .if \depth == 8
@@ -583,8 +575,8 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
         li      s1, 16
         mv      s4, a0
         mv      s5, a1
-        mv      s6, a2
-        mv      s7, a3
+        mv      a1, a2
+        mv      a2, a3
 1:
         andi    t0, s2, 1
         addi    s1, s1, -1
@@ -594,12 +586,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
 .endif
         lw      t2, (s5)   # block_offset[i]
         andi    t1, s3, 1
-        mv      a1, s6
-        mv      a2, s7
         add     a0, s4, t2
-.if \depth > 8
-        mv      a5, s8
-.endif
 .ifc \type, 16
         bnez    t1, 2f     # if (nnz == 1 && block[i * 16])
 .else
@@ -611,23 +598,13 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
 .ifnc \type, 16
         beqz    t1, 3f    # if (block[i * 16])
 .endif
-.if \depth == 8
-        call    ff_h264_idct_dc_add_\depth\()_c
-.else
-        jalr    s9
-.endif
+        jal     ff_h264_idct4_dc_add_\depth\()_rvv
 3:
         srli    s3, s3, 1
         addi    s5, s5, 4
-        addi    s6, s6, 16 * 2 * (\depth / 8)
+        addi    a1, a1, 16 * 2 * (\depth / 8)
         bnez    s1, 1b
 
-.if \depth > 8
-        ld      s9, 80(sp)
-        ld      s8, 72(sp)
-.endif
-        ld      s7, 64(sp)
-        ld      s6, 56(sp)
         ld      s5, 48(sp)
         ld      s4, 40(sp)
         ld      s3, 32(sp)
@@ -635,7 +612,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
         ld      s1, 16(sp)
         ld      ra,  8(sp)
         ld      s0,  0(sp)
-        addi    sp, sp, 96
+        addi    sp, sp, 64
         ret
 endfunc
 .endm
@@ -646,7 +623,7 @@ idct4_adds 16intra, \depth
 
 func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         csrwi       vxrm, 0
-        addi    sp, sp, -96
+        addi    sp, sp, -64
         lla     t0, ff_h264_scan8
         sd      s0,   (sp)
         li      t1, 4 * 32 * (\depth / 8)
@@ -658,14 +635,6 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         sd      s3, 32(sp)
         sd      s4, 40(sp)
         sd      s5, 48(sp)
-        sd      s6, 56(sp)
-        sd      s7, 64(sp)
-.if \depth > 8
-        sd      s8, 72(sp)
-        sd      s9, 80(sp)
-        mv      s8, a5
-        mv      s9, a6
-.endif
         vsetivli  zero, 4, e8, mf4, ta, ma
         vlse8.v   v8, (t0), t2
 .if \depth == 8
@@ -689,8 +658,8 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         li      s1, 4
         mv      s4, a0
         mv      s5, a1
-        mv      s6, a2
-        mv      s7, a3
+        mv      a1, a2
+        mv      a2, a3
 1:
         andi    t0, s2, 1
         addi    s1, s1, -1
@@ -698,33 +667,23 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         beqz    t0, 3f     # if (nnz)
         lw      t2, (s5)   # block_offset[i]
         andi    t1, s3, 1
-        mv      a1, s6
-        mv      a2, s7
         add     a0, s4, t2
-.if \depth > 8
-        mv      a5, s8
-.endif
         bnez    t1, 2f    # if (nnz == 1 && block[i * 16])
         jal     .Lidct8_add_\depth\()_rvv
-        j       3f
-2:
 .if \depth == 8
-        call    ff_h264_idct8_dc_add_\depth\()_c
+        j       3f
 .else
-        jalr    s9
+        j       4f        # idct8_add_16 updates a1
 .endif
+2:
+        jal     ff_h264_idct8_dc_add_\depth\()_rvv
 3:
+        addi    a1, a1, 4 * 16 * 2 * (\depth / 8)
+4:
         srli    s3, s3, 1
         addi    s5, s5, 4 * 4
-        addi    s6, s6, 4 * 16 * 2 * (\depth / 8)
         bnez    s1, 1b
 
-.if \depth > 8
-        ld      s9, 80(sp)
-        ld      s8, 72(sp)
-.endif
-        ld      s7, 64(sp)
-        ld      s6, 56(sp)
         ld      s5, 48(sp)
         ld      s4, 40(sp)
         ld      s3, 32(sp)
@@ -732,7 +691,7 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         ld      s1, 16(sp)
         ld      ra,  8(sp)
         ld      s0,  0(sp)
-        addi    sp, sp, 96
+        addi    sp, sp, 64
         ret
 endfunc
 .endr
@@ -740,19 +699,16 @@ endfunc
 .irp    depth, 9, 10, 12, 14
 func ff_h264_idct_add16_\depth\()_rvv, zve32x
         li      a5, (1 << \depth) - 1
-        lla     a6, ff_h264_idct_dc_add_\depth\()_c
         j       ff_h264_idct_add16_16_rvv
 endfunc
 
 func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
         li      a5, (1 << \depth) - 1
-        lla     a6, ff_h264_idct_dc_add_\depth\()_c
         j       ff_h264_idct_add16intra_16_rvv
 endfunc
 
 func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         li      a5, (1 << \depth) - 1
-        lla     a6, ff_h264_idct8_dc_add_\depth\()_c
         j       ff_h264_idct8_add4_16_rvv
 endfunc
 .endr
-- 
2.45.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 5/5] lavc/h264dsp: reduce spills in R-V V idct_add16
  2024-07-18 19:35 [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code Rémi Denis-Courmont
                   ` (2 preceding siblings ...)
  2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 4/5] lavc/h264dsp: reuse the R-V V IDCT DC add functions Rémi Denis-Courmont
@ 2024-07-18 19:35 ` Rémi Denis-Courmont
  3 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-18 19:35 UTC (permalink / raw)
  To: ffmpeg-devel

---
 libavcodec/riscv/h264idct_rvv.S | 100 ++++++++++++++------------------
 1 file changed, 45 insertions(+), 55 deletions(-)

diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index c42db6ef29..c74ea18c19 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -422,9 +422,9 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
 .else
         vsetivli        zero, \width, e8, mf4, ta, ma
 .endif
-        lh              a3, 0(a1)
-        addi            a3, a3, 32
-        srai            a3, a3, 6
+        lh              t0, 0(a1)
+        addi            t0, t0, 32
+        srai            t0, t0, 6
         sh              zero, 0(a1)
 .if \width == 8
         li              a6, \width * \width
@@ -435,7 +435,7 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
         vsetivli        zero, \width * \width, e16, m2, ta, ma
 .endif
         vzext.vf2       v0, v24
-        vadd.vx         v0, v0, a3
+        vadd.vx         v0, v0, t0
         vmax.vx         v0, v0, zero
 .if \width == 8
         vsetvli         zero, zero, e8, m4, ta, ma
@@ -464,33 +464,33 @@ func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
 .else
         vsetivli        zero, \width, e16, mf2, ta, ma
 .endif
-        lw              a3, 0(a1)
-        addi            a3, a3, 32
-        srai            a3, a3, 6
+        lw              t0, 0(a1)
+        addi            t0, t0, 32
+        srai            t0, t0, 6
         sw              zero, 0(a1)
-        add             t4, a0, a2
-        sh1add          t5, a2, a0
-        sh1add          t6, a2, t4
+        add             t1, a0, a2
+        sh1add          t2, a2, a0
+        sh1add          t3, a2, t1
 .if \width == 8
-        sh2add          t0, a2, a0
-        sh2add          t1, a2, t4
-        sh2add          t2, a2, t5
-        sh2add          t3, a2, t6
+        sh2add          t4, a2, a0
+        sh2add          t5, a2, t1
+        sh2add          t6, a2, t2
+        sh2add          a7, a2, t3
 .endif
         vle16.v         v0, (a0)
-        vle16.v         v1, (t4)
-        vle16.v         v2, (t5)
-        vle16.v         v3, (t6)
+        vle16.v         v1, (t1)
+        vle16.v         v2, (t2)
+        vle16.v         v3, (t3)
 .if \width == 8
-        vle16.v         v4, (t0)
-        vle16.v         v5, (t1)
-        vle16.v         v6, (t2)
-        vle16.v         v7, (t3)
+        vle16.v         v4, (t4)
+        vle16.v         v5, (t5)
+        vle16.v         v6, (t6)
+        vle16.v         v7, (a7)
         vsetvli         a6, zero, e16, m8, ta, ma
 .else
         vsetvli         a6, zero, e16, m4, ta, ma
 .endif
-        vadd.vx         v0, v0, a3
+        vadd.vx         v0, v0, t0
         vmax.vx         v0, v0, zero
         vmin.vx         v0, v0, a5
 .if \width == 8
@@ -499,14 +499,14 @@ func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
         vsetivli        zero, \width, e16, mf2, ta, ma
 .endif
         vse16.v         v0, (a0)
-        vse16.v         v1, (t4)
-        vse16.v         v2, (t5)
-        vse16.v         v3, (t6)
+        vse16.v         v1, (t1)
+        vse16.v         v2, (t2)
+        vse16.v         v3, (t3)
 .if \width == 8
-        vse16.v         v4, (t0)
-        vse16.v         v5, (t1)
-        vse16.v         v6, (t2)
-        vse16.v         v7, (t3)
+        vse16.v         v4, (t4)
+        vse16.v         v5, (t5)
+        vse16.v         v6, (t6)
+        vse16.v         v7, (a7)
 .endif
         ret
 endfunc
@@ -536,17 +536,12 @@ endconst
 .macro  idct4_adds type, depth
 func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
         csrwi   vxrm, 0
-        addi    sp, sp, -64
+        addi    sp, sp, -16
         lla     t0, ff_h264_scan8
         sd      s0,   (sp)
         li      t1, 32 * (\depth / 8)
         mv      s0, sp
         sd      ra,  8(sp)
-        sd      s1, 16(sp)
-        sd      s2, 24(sp)
-        sd      s3, 32(sp)
-        sd      s4, 40(sp)
-        sd      s5, 48(sp)
         vsetivli  zero, 16, e8, m1, ta, ma
         vle8.v    v8, (t0)
 .if \depth == 8
@@ -570,23 +565,23 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
         vmand.mm  v1, v1, v2
 .endif
         vsetvli   zero, zero, e16, m2, ta, ma
-        vmv.x.s   s2, v0
-        vmv.x.s   s3, v1
-        li      s1, 16
-        mv      s4, a0
-        mv      s5, a1
+        vmv.x.s   a4, v0
+        vmv.x.s   a7, v1
+        mv      t4, a0
+        mv      t5, a1
         mv      a1, a2
         mv      a2, a3
+        li      a3, 16
 1:
-        andi    t0, s2, 1
-        addi    s1, s1, -1
-        srli    s2, s2, 1
+        andi    t0, a4, 1
+        addi    a3, a3, -1
+        srli    a4, a4, 1
 .ifc \type, 16
         beqz    t0, 3f     # if (nnz)
 .endif
-        lw      t2, (s5)   # block_offset[i]
-        andi    t1, s3, 1
-        add     a0, s4, t2
+        lw      t2, (t5)   # block_offset[i]
+        andi    t1, a7, 1
+        add     a0, t4, t2
 .ifc \type, 16
         bnez    t1, 2f     # if (nnz == 1 && block[i * 16])
 .else
@@ -600,19 +595,14 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
 .endif
         jal     ff_h264_idct4_dc_add_\depth\()_rvv
 3:
-        srli    s3, s3, 1
-        addi    s5, s5, 4
+        srli    a7, a7, 1
+        addi    t5, t5, 4
         addi    a1, a1, 16 * 2 * (\depth / 8)
-        bnez    s1, 1b
+        bnez    a3, 1b
 
-        ld      s5, 48(sp)
-        ld      s4, 40(sp)
-        ld      s3, 32(sp)
-        ld      s2, 24(sp)
-        ld      s1, 16(sp)
         ld      ra,  8(sp)
         ld      s0,  0(sp)
-        addi    sp, sp, 64
+        addi    sp, sp, 16
         ret
 endfunc
 .endm
-- 
2.45.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-07-18 19:36 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-07-18 19:35 [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 2/5] lavc/h264dsp: move R-V V idct_dc_add Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 3/5] lavc/h264dsp: correct VL and LMUL in idct_dc_add Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 4/5] lavc/h264dsp: reuse the R-V V IDCT DC add functions Rémi Denis-Courmont
2024-07-18 19:35 ` [FFmpeg-devel] [PATCH 5/5] lavc/h264dsp: reduce spills in R-V V idct_add16 Rémi Denis-Courmont

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git