Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/2] checkasm/lpc: test compute_autocorr
@ 2023-12-12 21:02 Rémi Denis-Courmont
  2023-12-12 21:02 ` [FFmpeg-devel] [PATCH 2/2] lavc/lpc: R-V V compute_autocorr Rémi Denis-Courmont
  2023-12-14 16:41 ` [FFmpeg-devel] [PATCH 1/2] checkasm/lpc: test compute_autocorr Michael Niedermayer
  0 siblings, 2 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2023-12-12 21:02 UTC (permalink / raw)
  To: ffmpeg-devel

---
 tests/checkasm/lpc.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/tests/checkasm/lpc.c b/tests/checkasm/lpc.c
index 592e34c03d..4d84defec3 100644
--- a/tests/checkasm/lpc.c
+++ b/tests/checkasm/lpc.c
@@ -57,10 +57,41 @@ static void test_window(int len)
     bench_new(src, len, dst1);
 }
 
+static void test_compute_autocorr(ptrdiff_t len, int lag)
+{
+    LOCAL_ALIGNED(16, double, src, [5000]);
+    LOCAL_ALIGNED(16, double, dst0, [MAX_LPC_ORDER + 1]);
+    LOCAL_ALIGNED(16, double, dst1, [MAX_LPC_ORDER + 1]);
+
+    declare_func(void, const double *in, ptrdiff_t len, int lag, double *out);
+
+    av_assert0(lag >= 0 && lag <= MAX_LPC_ORDER);
+
+    for (size_t i = 0; i < len; i++) {
+        src[i] = (double)rnd() / (double)UINT_MAX;
+    }
+
+    call_ref(src, len, lag, dst0);
+    call_new(src, len, lag, dst1);
+
+    for (size_t i = 0; i < lag; i++) {
+        if (!double_near_abs_eps(dst0[i], dst1[i], EPS)) {
+            fprintf(stderr, "%zu: %- .12f - %- .12f = % .12g\n",
+                    i, dst0[i], dst1[i], dst0[i] - dst1[i]);
+            fail();
+            break;
+        }
+    }
+
+    bench_new(src, len, lag, dst1);
+}
+
 void checkasm_check_lpc(void)
 {
     LPCContext ctx;
-    int len = rnd() % 5000;
+    int len = 2000 + (rnd() % 3000);
+    static const int lags[] = { 10, 30, 32 };
+
     ff_lpc_init(&ctx, 32, 16, FF_LPC_TYPE_DEFAULT);
 
     if (check_func(ctx.lpc_apply_welch_window, "apply_welch_window_even")) {
@@ -72,6 +103,13 @@ void checkasm_check_lpc(void)
         test_window(len | 1);
     }
     report("apply_welch_window_odd");
-
     ff_lpc_end(&ctx);
+
+    for (size_t i = 0; i < FF_ARRAY_ELEMS(lags); i++) {
+        ff_lpc_init(&ctx, 32, lags[i], FF_LPC_TYPE_DEFAULT);
+        if (check_func(ctx.lpc_compute_autocorr, "autocorr_%d", lags[i]))
+            test_compute_autocorr(len, lags[i]);
+        report("compute_autocorr_%d", lags[i]);
+        ff_lpc_end(&ctx);
+    }
 }
-- 
2.43.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 2/2] lavc/lpc: R-V V compute_autocorr
  2023-12-12 21:02 [FFmpeg-devel] [PATCH 1/2] checkasm/lpc: test compute_autocorr Rémi Denis-Courmont
@ 2023-12-12 21:02 ` Rémi Denis-Courmont
  2023-12-12 21:05   ` Rémi Denis-Courmont
  2023-12-14 16:41 ` [FFmpeg-devel] [PATCH 1/2] checkasm/lpc: test compute_autocorr Michael Niedermayer
  1 sibling, 1 reply; 5+ messages in thread
From: Rémi Denis-Courmont @ 2023-12-12 21:02 UTC (permalink / raw)
  To: ffmpeg-devel

The loop iterates over the length of the vector, not the order. This is
to avoid reloading the same data for each lag value. However this means
the loop only works if the maximum order is no larger than VLENB.

The loop is roughly equivalent to:

    for (size_t j = 0; j < lag; j++)
        autoc[j] = 1.;

    while (len > lag) {
        for (ptrdiff_t j = 0; j < lag; j++)
            autoc[j] += data[j] * *data;
        data++;
        len--;
    }

    while (len > 0) {
        for (ptrdiff_t j = 0; j < len; j++)
            autoc[j] += data[j] * *data;
        data++;
        len--;
    }

Since register pressure is only at 50%, it should be possible to implement
the same loop for order up to 2xVLENB. But this is left for future work.

Performance numbers are all over the place from ~1.25x to ~4x speedups,
but at least they are always noticeably better than nothing.
---
 libavcodec/riscv/lpc_init.c |  8 +++++++-
 libavcodec/riscv/lpc_rvv.S  | 29 +++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/lpc_init.c b/libavcodec/riscv/lpc_init.c
index c16e5745f0..ab91956f2d 100644
--- a/libavcodec/riscv/lpc_init.c
+++ b/libavcodec/riscv/lpc_init.c
@@ -22,16 +22,22 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
 #include "libavcodec/lpc.h"
 
 void ff_lpc_apply_welch_window_rvv(const int32_t *, ptrdiff_t, double *);
+void ff_lpc_compute_autocorr_rvv(const double *, ptrdiff_t, int, double *);
 
 av_cold void ff_lpc_init_riscv(LPCContext *c)
 {
 #if HAVE_RVV && (__riscv_xlen >= 64)
     int flags = av_get_cpu_flags();
 
-    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR))
+    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
         c->lpc_apply_welch_window = ff_lpc_apply_welch_window_rvv;
+
+        if (ff_get_rv_vlenb() >= c->max_order)
+            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_rvv;
+    }
 #endif
 }
diff --git a/libavcodec/riscv/lpc_rvv.S b/libavcodec/riscv/lpc_rvv.S
index f81a2392c1..654156bf12 100644
--- a/libavcodec/riscv/lpc_rvv.S
+++ b/libavcodec/riscv/lpc_rvv.S
@@ -85,4 +85,33 @@ func ff_lpc_apply_welch_window_rvv, zve64d
 
         ret
 endfunc
+
+func ff_lpc_compute_autocorr_rvv, zve64d
+        li        t0, 1
+        vsetvli   t1, a2, e64, m8, ta, ma
+        fcvt.d.l  ft0, t0
+        vle64.v   v0, (a0)
+        sh3add    a0, a2, a0   # data += lag
+        vfmv.v.f  v16, ft0
+        bge       a2, a1, 2f
+1:
+        vfmv.f.s  ft0, v0
+        fld       ft1, (a0)    # ft1 = data[lag + i]
+        vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j]
+        addi      a1, a1, -1
+        vfslide1down.vf v0, v0, ft1
+        addi      a0, a0, 8
+        bgt       a1, a2, 1b   # while (len > lag);
+2:
+        vfmv.f.s  ft0, v0
+        vsetvli   zero, a1, e64, m8, tu, ma
+        vfmacc.vf v16, ft0, v0
+        addi      a1, a1, -1
+        vslide1down.vx v0, v0, zero
+        bnez      a1, 2b       # while (len > 0);
+
+        vsetvli   zero, a2, e64, m8, ta, ma
+        vse64.v   v16, (a3)
+        ret
+endfunc
 #endif
-- 
2.43.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] lavc/lpc: R-V V compute_autocorr
  2023-12-12 21:02 ` [FFmpeg-devel] [PATCH 2/2] lavc/lpc: R-V V compute_autocorr Rémi Denis-Courmont
@ 2023-12-12 21:05   ` Rémi Denis-Courmont
  0 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2023-12-12 21:05 UTC (permalink / raw)
  To: ffmpeg-devel

Le tiistaina 12. joulukuuta 2023, 23.02.40 EET Rémi Denis-Courmont a écrit :
> The loop iterates over the length of the vector, not the order. This is
> to avoid reloading the same data for each lag value. However this means
> the loop only works if the maximum order is no larger than VLENB.
> 
> The loop is roughly equivalent to:
> 
>     for (size_t j = 0; j < lag; j++)
>         autoc[j] = 1.;
> 
>     while (len > lag) {
>         for (ptrdiff_t j = 0; j < lag; j++)
>             autoc[j] += data[j] * *data;
>         data++;
>         len--;
>     }
> 
>     while (len > 0) {
>         for (ptrdiff_t j = 0; j < len; j++)
>             autoc[j] += data[j] * *data;
>         data++;
>         len--;
>     }
> 
> Since register pressure is only at 50%, it should be possible to implement
> the same loop for order up to 2xVLENB. But this is left for future work.
> 
> Performance numbers are all over the place from ~1.25x to ~4x speedups,
> but at least they are always noticeably better than nothing.
> ---
>  libavcodec/riscv/lpc_init.c |  8 +++++++-
>  libavcodec/riscv/lpc_rvv.S  | 29 +++++++++++++++++++++++++++++
>  2 files changed, 36 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/riscv/lpc_init.c b/libavcodec/riscv/lpc_init.c
> index c16e5745f0..ab91956f2d 100644
> --- a/libavcodec/riscv/lpc_init.c
> +++ b/libavcodec/riscv/lpc_init.c
> @@ -22,16 +22,22 @@
> 
>  #include "libavutil/attributes.h"
>  #include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
>  #include "libavcodec/lpc.h"
> 
>  void ff_lpc_apply_welch_window_rvv(const int32_t *, ptrdiff_t, double *);
> +void ff_lpc_compute_autocorr_rvv(const double *, ptrdiff_t, int, double *);
> 
>  av_cold void ff_lpc_init_riscv(LPCContext *c)
>  {
>  #if HAVE_RVV && (__riscv_xlen >= 64)
>      int flags = av_get_cpu_flags();
> 
> -    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR))
> +    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
>          c->lpc_apply_welch_window = ff_lpc_apply_welch_window_rvv;
> +
> +        if (ff_get_rv_vlenb() >= c->max_order)
> +            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_rvv;
> +    }
>  #endif
>  }
> diff --git a/libavcodec/riscv/lpc_rvv.S b/libavcodec/riscv/lpc_rvv.S
> index f81a2392c1..654156bf12 100644
> --- a/libavcodec/riscv/lpc_rvv.S
> +++ b/libavcodec/riscv/lpc_rvv.S
> @@ -85,4 +85,33 @@ func ff_lpc_apply_welch_window_rvv, zve64d
> 
>          ret
>  endfunc
> +
> +func ff_lpc_compute_autocorr_rvv, zve64d
> +        li        t0, 1
> +        vsetvli   t1, a2, e64, m8, ta, ma

t1 is unused; the destination register should be zero. This is a leftover from
an incomplete attempt to unroll.

> +        fcvt.d.l  ft0, t0
> +        vle64.v   v0, (a0)
> +        sh3add    a0, a2, a0   # data += lag
> +        vfmv.v.f  v16, ft0
> +        bge       a2, a1, 2f
> +1:
> +        vfmv.f.s  ft0, v0
> +        fld       ft1, (a0)    # ft1 = data[lag + i]
> +        vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j]
> +        addi      a1, a1, -1
> +        vfslide1down.vf v0, v0, ft1
> +        addi      a0, a0, 8
> +        bgt       a1, a2, 1b   # while (len > lag);
> +2:
> +        vfmv.f.s  ft0, v0
> +        vsetvli   zero, a1, e64, m8, tu, ma
> +        vfmacc.vf v16, ft0, v0
> +        addi      a1, a1, -1
> +        vslide1down.vx v0, v0, zero
> +        bnez      a1, 2b       # while (len > 0);
> +
> +        vsetvli   zero, a2, e64, m8, ta, ma
> +        vse64.v   v16, (a3)
> +        ret
> +endfunc
>  #endif


-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] checkasm/lpc: test compute_autocorr
  2023-12-12 21:02 [FFmpeg-devel] [PATCH 1/2] checkasm/lpc: test compute_autocorr Rémi Denis-Courmont
  2023-12-12 21:02 ` [FFmpeg-devel] [PATCH 2/2] lavc/lpc: R-V V compute_autocorr Rémi Denis-Courmont
@ 2023-12-14 16:41 ` Michael Niedermayer
  2023-12-14 16:56   ` Rémi Denis-Courmont
  1 sibling, 1 reply; 5+ messages in thread
From: Michael Niedermayer @ 2023-12-14 16:41 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 3417 bytes --]

On Tue, Dec 12, 2023 at 11:02:39PM +0200, Rémi Denis-Courmont wrote:
> ---
>  tests/checkasm/lpc.c | 42 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 40 insertions(+), 2 deletions(-)

Often this works, but not always:

ffmpeg/tests/checkasm/checkasm --test=lpc
checkasm: using random seed 3321308425
SSE2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
 - lpc.compute_autocorr_10     [OK]
 - lpc.compute_autocorr_30     [OK]
 - lpc.compute_autocorr_32     [OK]
AVX2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
checkasm: all 7 tests passed
$ ffmpeg/tests/checkasm/checkasm --test=lpc
checkasm: using random seed 3962111
SSE2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
 - lpc.compute_autocorr_10     [OK]
 - lpc.compute_autocorr_30     [OK]
 - lpc.compute_autocorr_32     [OK]
AVX2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
checkasm: all 7 tests passed
$ ffmpeg/tests/checkasm/checkasm --test=lpc
checkasm: using random seed 2098118744
SSE2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
 - lpc.compute_autocorr_10     [OK]
 - lpc.compute_autocorr_30     [OK]
 - lpc.compute_autocorr_32     [OK]
AVX2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
checkasm: all 7 tests passed
$ ffmpeg/tests/checkasm/checkasm --test=lpc
checkasm: using random seed 2281870905
SSE2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
 - lpc.compute_autocorr_10     [OK]
 - lpc.compute_autocorr_30     [OK]
 - lpc.compute_autocorr_32     [OK]
AVX2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
checkasm: all 7 tests passed
$ ffmpeg/tests/checkasm/checkasm --test=lpc
checkasm: using random seed 258427554
SSE2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
 - lpc.compute_autocorr_10     [OK]
 - lpc.compute_autocorr_30     [OK]
 - lpc.compute_autocorr_32     [OK]
AVX2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
checkasm: all 7 tests passed
$ ffmpeg/tests/checkasm/checkasm --test=lpc
checkasm: using random seed 966320013
SSE2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
0:  976.228035341704 -  976.998462662304 = -0.7704273206
   autocorr_10_sse2 (lpc.c:81)
 - lpc.compute_autocorr_10     [FAILED]
0:  966.946397975397 -  967.716825295995 = -0.770427320599
   autocorr_30_sse2 (lpc.c:81)
 - lpc.compute_autocorr_30     [FAILED]
0:  968.085384693526 -  968.855812014127 = -0.770427320601
   autocorr_32_sse2 (lpc.c:81)
 - lpc.compute_autocorr_32     [FAILED]
AVX2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
checkasm: 3 of 7 tests have failed
$ ffmpeg/tests/checkasm/checkasm --test=lpc
checkasm: using random seed 470640728
SSE2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
 - lpc.compute_autocorr_10     [OK]
 - lpc.compute_autocorr_30     [OK]
 - lpc.compute_autocorr_32     [OK]
AVX2:
 - lpc.apply_welch_window_even [OK]
 - lpc.apply_welch_window_odd  [OK]
checkasm: all 7 tests passed


[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Opposition brings concord. Out of discord comes the fairest harmony.
-- Heraclitus

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] checkasm/lpc: test compute_autocorr
  2023-12-14 16:41 ` [FFmpeg-devel] [PATCH 1/2] checkasm/lpc: test compute_autocorr Michael Niedermayer
@ 2023-12-14 16:56   ` Rémi Denis-Courmont
  0 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2023-12-14 16:56 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le torstaina 14. joulukuuta 2023, 18.41.24 EET Michael Niedermayer a écrit :
> SSE2:
>  - lpc.apply_welch_window_even [OK]
>  - lpc.apply_welch_window_odd  [OK]
> 0:  976.228035341704 -  976.998462662304 = -0.7704273206
>    autocorr_10_sse2 (lpc.c:81)
>  - lpc.compute_autocorr_10     [FAILED]
> 0:  966.946397975397 -  967.716825295995 = -0.770427320599
>    autocorr_30_sse2 (lpc.c:81)
>  - lpc.compute_autocorr_30     [FAILED]
> 0:  968.085384693526 -  968.855812014127 = -0.770427320601

Right, it seems that the SSE optimisation craps out on odd lengths. The RVV 
code seems to match the C code there, so I am not sure if this is exposing an 
existing bug in the SSE code, or if odd lengths are illegal.

On a related note, we should probably test for odd lag values, as the C code 
has special handling for them. But from a quick glance, it seems that the SSE 
code also fails to deal with that case.

>    autocorr_32_sse2 (lpc.c:81)
>  - lpc.compute_autocorr_32     [FAILED]
> AVX2:
>  - lpc.apply_welch_window_even [OK]
>  - lpc.apply_welch_window_odd  [OK]
> checkasm: 3 of 7 tests have failed
> $ ffmpeg/tests/checkasm/checkasm --test=lpc
> checkasm: using random seed 470640728
> SSE2:
>  - lpc.apply_welch_window_even [OK]
>  - lpc.apply_welch_window_odd  [OK]
>  - lpc.compute_autocorr_10     [OK]
>  - lpc.compute_autocorr_30     [OK]
>  - lpc.compute_autocorr_32     [OK]
> AVX2:
>  - lpc.apply_welch_window_even [OK]
>  - lpc.apply_welch_window_odd  [OK]
> checkasm: all 7 tests passed
> 
> 
> [...]


-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-12-14 16:57 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-12 21:02 [FFmpeg-devel] [PATCH 1/2] checkasm/lpc: test compute_autocorr Rémi Denis-Courmont
2023-12-12 21:02 ` [FFmpeg-devel] [PATCH 2/2] lavc/lpc: R-V V compute_autocorr Rémi Denis-Courmont
2023-12-12 21:05   ` Rémi Denis-Courmont
2023-12-14 16:41 ` [FFmpeg-devel] [PATCH 1/2] checkasm/lpc: test compute_autocorr Michael Niedermayer
2023-12-14 16:56   ` Rémi Denis-Courmont

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git