Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/2] lavc/aacpsdsp: rework R-V V add_squares
@ 2023-11-08 20:30 Rémi Denis-Courmont
  2023-11-08 20:30 ` [FFmpeg-devel] [PATCH 2/2] lavc/aacpsdsp: rework R-V V hybrid_synthesis_deint Rémi Denis-Courmont
  0 siblings, 1 reply; 2+ messages in thread
From: Rémi Denis-Courmont @ 2023-11-08 20:30 UTC (permalink / raw)
  To: ffmpeg-devel

Segmented loads may be slower than unit-strided loads. So this instead uses
a unit-strided 64-bit load, followed by narrowing shifts to deinterleave.

Before:
ps_add_squares_c: 60757.7
ps_add_squares_rvv_f32: 22242.5

After:
ps_add_squares_c: 60516.0
ps_add_squares_rvv_i64: 17067.7
---
 libavcodec/riscv/aacpsdsp_init.c | 3 ++-
 libavcodec/riscv/aacpsdsp_rvv.S  | 9 ++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/libavcodec/riscv/aacpsdsp_init.c b/libavcodec/riscv/aacpsdsp_init.c
index c5ec796232..f72d1bc330 100644
--- a/libavcodec/riscv/aacpsdsp_init.c
+++ b/libavcodec/riscv/aacpsdsp_init.c
@@ -46,7 +46,8 @@ av_cold void ff_psdsp_init_riscv(PSDSPContext *c)
         c->hybrid_analysis = ff_ps_hybrid_analysis_rvv;
 
         if (flags & AV_CPU_FLAG_RVB_ADDR) {
-            c->add_squares = ff_ps_add_squares_rvv;
+            if (flags & AV_CPU_FLAG_RVV_I64)
+                c->add_squares = ff_ps_add_squares_rvv;
             c->mul_pair_single = ff_ps_mul_pair_single_rvv;
             c->stereo_interpolate[0] = ff_ps_stereo_interpolate_rvv;
         }
diff --git a/libavcodec/riscv/aacpsdsp_rvv.S b/libavcodec/riscv/aacpsdsp_rvv.S
index fe250cd83b..cf872599c8 100644
--- a/libavcodec/riscv/aacpsdsp_rvv.S
+++ b/libavcodec/riscv/aacpsdsp_rvv.S
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2022 Rémi Denis-Courmont.
+ * Copyright © 2022-2023 Rémi Denis-Courmont.
  *
  * This file is part of FFmpeg.
  *
@@ -20,13 +20,16 @@
 
 #include "libavutil/riscv/asm.S"
 
-func ff_ps_add_squares_rvv, zve32f
+func ff_ps_add_squares_rvv, zve64f
+        li          t1, 32
 1:
         vsetvli     t0, a2, e32, m4, ta, ma
-        vlseg2e32.v v24, (a1)
+        vle64.v     v8, (a1)
         sub         a2, a2, t0
+        vnsrl.wx    v24, v8, zero
         vle32.v     v16, (a0)
         sh3add      a1, t0, a1
+        vnsrl.wx    v28, v8, t1
         vfmacc.vv   v16, v24, v24
         vfmacc.vv   v16, v28, v28
         vse32.v     v16, (a0)
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [FFmpeg-devel] [PATCH 2/2] lavc/aacpsdsp: rework R-V V hybrid_synthesis_deint
  2023-11-08 20:30 [FFmpeg-devel] [PATCH 1/2] lavc/aacpsdsp: rework R-V V add_squares Rémi Denis-Courmont
@ 2023-11-08 20:30 ` Rémi Denis-Courmont
  0 siblings, 0 replies; 2+ messages in thread
From: Rémi Denis-Courmont @ 2023-11-08 20:30 UTC (permalink / raw)
  To: ffmpeg-devel

Given the size of the data set, strided memory accesses cannot be avoided.
We can still do better than the current code.

ps_hybrid_synthesis_deint_c:       12065.5
ps_hybrid_synthesis_deint_rvv_i32: 13650.2 (before)
ps_hybrid_synthesis_deint_rvv_i64:  8181.0 (after)
---
 libavcodec/riscv/aacpsdsp_init.c |  8 ++---
 libavcodec/riscv/aacpsdsp_rvv.S  | 61 +++++++++++++++++---------------
 2 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/libavcodec/riscv/aacpsdsp_init.c b/libavcodec/riscv/aacpsdsp_init.c
index f72d1bc330..e094660cf3 100644
--- a/libavcodec/riscv/aacpsdsp_init.c
+++ b/libavcodec/riscv/aacpsdsp_init.c
@@ -46,16 +46,16 @@ av_cold void ff_psdsp_init_riscv(PSDSPContext *c)
         c->hybrid_analysis = ff_ps_hybrid_analysis_rvv;
 
         if (flags & AV_CPU_FLAG_RVB_ADDR) {
-            if (flags & AV_CPU_FLAG_RVV_I64)
+            if (flags & AV_CPU_FLAG_RVV_I64) {
                 c->add_squares = ff_ps_add_squares_rvv;
+                c->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_rvv;
+            }
             c->mul_pair_single = ff_ps_mul_pair_single_rvv;
             c->stereo_interpolate[0] = ff_ps_stereo_interpolate_rvv;
         }
     }
 
-    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
+    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR))
         c->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_rvv;
-        c->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_rvv;
-    }
 #endif
 }
diff --git a/libavcodec/riscv/aacpsdsp_rvv.S b/libavcodec/riscv/aacpsdsp_rvv.S
index cf872599c8..1dc426e01c 100644
--- a/libavcodec/riscv/aacpsdsp_rvv.S
+++ b/libavcodec/riscv/aacpsdsp_rvv.S
@@ -190,38 +190,41 @@ func ff_ps_hybrid_analysis_ileave_rvv, zve32x /* no needs for zve32f here */
         ret
 endfunc
 
-func ff_ps_hybrid_synthesis_deint_rvv, zve32x
-        slli        t1, a2, 5 + 1 + 2
-        sh2add      a0, a2, a0
-        add         a1, a1, t1
-        addi        a2, a2, -64
-        li          t1, 38 * 64 * 4
-        li          t6, 64 * 4
-        add         a4, a0, t1
-        beqz        a2, 3f
+func ff_ps_hybrid_synthesis_deint_rvv, zve64x
+        slli    t0, a2, 5 + 1 + 2
+        sh2add  a0, a2, a0
+        add     a1, a1, t0
+        addi    t2, a2, -64
+        li      t0, 38 * 64
+        li      t1, 32 * 2 * 4
+        li      t4, 8 - 16384 // offset from in[64][n][0] to in[0][n + 1][0]
+        slli    t5, a2, 5 + 1 + 2 // and from in[0][n+1][0] to in[0][n+1][s]
+        neg     t2, t2
+        li      t3, 32
+        add     a4, t4, t5
+        sh2add  t0, t0, a0
 1:
-        mv          t0, a0
-        mv          t1, a1
-        mv          t3, a3
-        mv          t4, a4
-        addi        a2, a2, 1
+        mv      t4, t2
+        addi    a3, a3, -1
 2:
-        vsetvli     t5, t3, e32, m4, ta, ma
-        vlseg2e32.v v16, (t1)
-        sub         t3, t3, t5
-        vsse32.v    v16, (t0), t6
-        mul         t2, t5, t6
-        vsse32.v    v20, (t4), t6
-        sh3add      t1, t5, t1
-        add         t0, t0, t2
-        add         t4, t4, t2
-        bnez        t3, 2b
+        vsetvli t5, t4, e32, m4, ta, ma
+        vlse64.v v16, (a1), t1 /* sizeof (float[32][2]) */
+        sub     t4, t4, t5
+        vnsrl.wx v24, v16, zero
+        slli    t6, t5, 5 + 1 + 2
+        vnsrl.wx v28, v16, t3 /* 32 */
+        add     a1, a1, t6
+        vse32.v v24, (a0)
+        sh2add  a0, t5, a0
+        vse32.v v28, (t0)
+        sh2add  t0, t5, t0
+        bnez    t4, 2b
+
+        add     a1, a1, a4
+        sh2add  a0, a2, a0
+        sh2add  t0, a2, t0
+        bnez    a3, 1b
 
-        add         a0, a0, 4
-        add         a1, a1, 32 * 2 * 4
-        add         a4, a4, 4
-        bnez        a2, 1b
-3:
         ret
 endfunc
 
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-11-08 20:30 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-08 20:30 [FFmpeg-devel] [PATCH 1/2] lavc/aacpsdsp: rework R-V V add_squares Rémi Denis-Courmont
2023-11-08 20:30 ` [FFmpeg-devel] [PATCH 2/2] lavc/aacpsdsp: rework R-V V hybrid_synthesis_deint Rémi Denis-Courmont

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git