* [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: run lpc benchmarks with an unmodified buffer
@ 2024-05-11 19:46 James Almer
  2024-05-11 19:46 ` [FFmpeg-devel] [PATCH 2/2] checkasm/flacdsp: sanitize lpc arguments James Almer
                   ` (8 more replies)
  0 siblings, 9 replies; 18+ messages in thread
From: James Almer @ 2024-05-11 19:46 UTC (permalink / raw)
  To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
 tests/checkasm/flacdsp.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
index b308237db1..6561b4ed20 100644
--- a/tests/checkasm/flacdsp.c
+++ b/tests/checkasm/flacdsp.c
@@ -58,6 +58,7 @@ static void check_lpc(int pred_order)
 {
     int qlevel = rnd() % 16;
     LOCAL_ALIGNED_16(int32_t, coeffs, [32]);
+    LOCAL_ALIGNED_16(int32_t, dst,  [BUF_SIZE]);
     LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]);
     LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]);
 
@@ -66,14 +67,15 @@ static void check_lpc(int pred_order)
     for (int i = 0; i < 32; i++)
         coeffs[i] = rnd();
     for (int i = 0; i < BUF_SIZE; i++)
-        dst0[i] = rnd();
+        dst[i] = rnd();
 
-    memcpy(dst1, dst0, BUF_SIZE * sizeof (int32_t));
+    memcpy(dst0, dst, BUF_SIZE * sizeof (int32_t));
+    memcpy(dst1, dst, BUF_SIZE * sizeof (int32_t));
     call_ref(dst0, coeffs, pred_order, qlevel, BUF_SIZE);
     call_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
     if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0)
        fail();
-    bench_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
+    bench_new(dst, coeffs, pred_order, qlevel, BUF_SIZE);
 }
 
 void checkasm_check_flacdsp(void)
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 2/2] checkasm/flacdsp: sanitize lpc arguments
  2024-05-11 19:46 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: run lpc benchmarks with an unmodified buffer James Almer
@ 2024-05-11 19:46 ` James Almer
  2024-05-11 20:31   ` [FFmpeg-devel] [PATCH 2/2 v2] " James Almer
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 3/8] x86/flacdsp: add a SSE4 version of lpc16 James Almer
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 18+ messages in thread
From: James Almer @ 2024-05-11 19:46 UTC (permalink / raw)
  To: ffmpeg-devel
Fixes signed integer overflows as reported by ubsan.
Signed-off-by: James Almer <jamrial@gmail.com>
---
 tests/checkasm/flacdsp.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
index 6561b4ed20..bf25cea39c 100644
--- a/tests/checkasm/flacdsp.c
+++ b/tests/checkasm/flacdsp.c
@@ -54,9 +54,10 @@ static void check_decorrelate(uint8_t **ref_dst, uint8_t **ref_src, uint8_t **ne
     bench_new(new_dst, (int32_t **)new_src, channels, BUF_SIZE / sizeof(int32_t), 8);
 }
 
-static void check_lpc(int pred_order)
+static void check_lpc(int pred_order, int bps)
 {
     int qlevel = rnd() % 16;
+    int coeff_prec = rnd() % 16;
     LOCAL_ALIGNED_16(int32_t, coeffs, [32]);
     LOCAL_ALIGNED_16(int32_t, dst,  [BUF_SIZE]);
     LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]);
@@ -64,11 +65,13 @@ static void check_lpc(int pred_order)
 
     declare_func(void, int32_t *, const int[32], int, int, int);
 
+    if (bps <= 16)
+        coeff_prec = av_clip(coeff_prec, 0, 32 - bps - av_log2(pred_order));
+
     for (int i = 0; i < 32; i++)
-        coeffs[i] = rnd();
+        coeffs[i] = av_mod_uintp2(rnd(), coeff_prec);
     for (int i = 0; i < BUF_SIZE; i++)
-        dst[i] = rnd();
-
+        dst[i] = rnd() & ((1LL << bps) - 1);
     memcpy(dst0, dst, BUF_SIZE * sizeof (int32_t));
     memcpy(dst1, dst, BUF_SIZE * sizeof (int32_t));
     call_ref(dst0, coeffs, pred_order, qlevel, BUF_SIZE);
@@ -116,10 +119,10 @@ void checkasm_check_flacdsp(void)
 
     for (i = 0; i < FF_ARRAY_ELEMS(pred_orders); i++)
         if (check_func(h.lpc16, "flac_lpc_16_%d", pred_orders[i]))
-            check_lpc(pred_orders[i]);
+            check_lpc(pred_orders[i], 16);
     for (i = 0; i < FF_ARRAY_ELEMS(pred_orders); i++)
         if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i]))
-            check_lpc(pred_orders[i]);
+            check_lpc(pred_orders[i], 32);
 
     report("lpc");
 }
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 2/2 v2] checkasm/flacdsp: sanitize lpc arguments
  2024-05-11 19:46 ` [FFmpeg-devel] [PATCH 2/2] checkasm/flacdsp: sanitize lpc arguments James Almer
@ 2024-05-11 20:31   ` James Almer
  0 siblings, 0 replies; 18+ messages in thread
From: James Almer @ 2024-05-11 20:31 UTC (permalink / raw)
  To: ffmpeg-devel
Fixes signed integer overflows as reported by ubsan.
Signed-off-by: James Almer <jamrial@gmail.com>
---
Now allowing negative values.
 tests/checkasm/flacdsp.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
index 6561b4ed20..d694c1093b 100644
--- a/tests/checkasm/flacdsp.c
+++ b/tests/checkasm/flacdsp.c
@@ -21,6 +21,7 @@
 #include <string.h>
 #include "checkasm.h"
 #include "libavcodec/flacdsp.h"
+#include "libavcodec/mathops.h"
 #include "libavutil/common.h"
 #include "libavutil/internal.h"
 #include "libavutil/intreadwrite.h"
@@ -54,9 +55,10 @@ static void check_decorrelate(uint8_t **ref_dst, uint8_t **ref_src, uint8_t **ne
     bench_new(new_dst, (int32_t **)new_src, channels, BUF_SIZE / sizeof(int32_t), 8);
 }
 
-static void check_lpc(int pred_order)
+static void check_lpc(int pred_order, int bps)
 {
     int qlevel = rnd() % 16;
+    int coeff_prec = (rnd() % 15) + 1;
     LOCAL_ALIGNED_16(int32_t, coeffs, [32]);
     LOCAL_ALIGNED_16(int32_t, dst,  [BUF_SIZE]);
     LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]);
@@ -64,10 +66,13 @@ static void check_lpc(int pred_order)
 
     declare_func(void, int32_t *, const int[32], int, int, int);
 
+    if (bps <= 16)
+        coeff_prec = av_clip(coeff_prec, 0, 32 - bps - av_log2(pred_order));
+
     for (int i = 0; i < 32; i++)
-        coeffs[i] = rnd();
+        coeffs[i] = sign_extend(rnd(), coeff_prec);
     for (int i = 0; i < BUF_SIZE; i++)
-        dst[i] = rnd();
+        dst[i] = sign_extend(rnd(), bps);
 
     memcpy(dst0, dst, BUF_SIZE * sizeof (int32_t));
     memcpy(dst1, dst, BUF_SIZE * sizeof (int32_t));
@@ -116,10 +121,10 @@ void checkasm_check_flacdsp(void)
 
     for (i = 0; i < FF_ARRAY_ELEMS(pred_orders); i++)
         if (check_func(h.lpc16, "flac_lpc_16_%d", pred_orders[i]))
-            check_lpc(pred_orders[i]);
+            check_lpc(pred_orders[i], 16);
     for (i = 0; i < FF_ARRAY_ELEMS(pred_orders); i++)
         if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i]))
-            check_lpc(pred_orders[i]);
+            check_lpc(pred_orders[i], 32);
 
     report("lpc");
 }
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 3/8] x86/flacdsp: add a SSE4 version of lpc16
  2024-05-11 19:46 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: run lpc benchmarks with an unmodified buffer James Almer
  2024-05-11 19:46 ` [FFmpeg-devel] [PATCH 2/2] checkasm/flacdsp: sanitize lpc arguments James Almer
@ 2024-05-12 16:06 ` James Almer
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 4/8] avcodec/flacdsp: split off wasted bit handling into dsp functions James Almer
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 18+ messages in thread
From: James Almer @ 2024-05-12 16:06 UTC (permalink / raw)
  To: ffmpeg-devel
flac_lpc_16_13_c: 2841.3
flac_lpc_16_13_sse4: 2151.8
flac_lpc_16_16_c: 3382.8
flac_lpc_16_16_sse4: 2228.3
flac_lpc_16_29_c: 5800.3
flac_lpc_16_29_sse4: 3727.3
flac_lpc_16_32_c: 5972.8
flac_lpc_16_32_sse4: 4052.3
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/flacdsp.asm    | 13 +++++++------
 libavcodec/x86/flacdsp_init.c |  3 +++
 2 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
index 4b2fd65435..f38eb7db76 100644
--- a/libavcodec/x86/flacdsp.asm
+++ b/libavcodec/x86/flacdsp.asm
@@ -38,9 +38,9 @@ SECTION .text
 %endif
 %endmacro
 
-%macro LPC_32 1
+%macro LPC_32 3
 INIT_XMM %1
-cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
+cglobal flac_lpc_%2, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
     sub    lend, pred_orderd
     jle .ret
     movsxdifnidn pred_orderq, pred_orderd
@@ -67,14 +67,14 @@ ALIGN 16
     jl .loop_order
 .end_order:
     PMACSDQL m2, m0, m1, m2, m0
-    psrlq  m2, m4
+    %3     m2, m4
     movd   m0, [decodedq]
     paddd  m0, m2
     movd   [decodedq], m0
     sub  lend, 2
     jl .ret
     PMACSDQL m3, m1, m0, m3, m1
-    psrlq  m3, m4
+    %3     m3, m4
     movd   m1, [decodedq+4]
     paddd  m1, m3
     movd   [decodedq+4], m1
@@ -83,10 +83,11 @@ ALIGN 16
     RET
 %endmacro
 
+LPC_32 sse4, 16, psrad
+LPC_32 sse4, 32, psrlq
 %if HAVE_XOP_EXTERNAL
-LPC_32 xop
+LPC_32 xop,  32, psrlq
 %endif
-LPC_32 sse4
 
 ;----------------------------------------------------------------------------------
 ;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index 87daed7005..dee4bf88fc 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -23,6 +23,8 @@
 #include "libavutil/x86/cpu.h"
 #include "config.h"
 
+void ff_flac_lpc_16_sse4(int32_t *samples, const int coeffs[32], int order,
+                         int qlevel, int len);
 void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
                          int qlevel, int len);
 void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
@@ -93,6 +95,7 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
         }
     }
     if (EXTERNAL_SSE4(cpu_flags)) {
+        c->lpc16 = ff_flac_lpc_16_sse4;
         c->lpc32 = ff_flac_lpc_32_sse4;
     }
     if (EXTERNAL_AVX(cpu_flags)) {
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 4/8] avcodec/flacdsp: split off wasted bit handling into dsp functions
  2024-05-11 19:46 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: run lpc benchmarks with an unmodified buffer James Almer
  2024-05-11 19:46 ` [FFmpeg-devel] [PATCH 2/2] checkasm/flacdsp: sanitize lpc arguments James Almer
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 3/8] x86/flacdsp: add a SSE4 version of lpc16 James Almer
@ 2024-05-12 16:06 ` James Almer
  2024-05-12 16:15   ` Andreas Rheinhardt
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 5/8] checkasm/flacdsp: add a test for wasted32 James Almer
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 18+ messages in thread
From: James Almer @ 2024-05-12 16:06 UTC (permalink / raw)
  To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/flacdec.c |  8 ++------
 libavcodec/flacdsp.c | 20 ++++++++++++++++++++
 libavcodec/flacdsp.h |  3 +++
 3 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/libavcodec/flacdec.c b/libavcodec/flacdec.c
index 91bbdc657d..6e6a2896b4 100644
--- a/libavcodec/flacdec.c
+++ b/libavcodec/flacdec.c
@@ -602,13 +602,9 @@ static inline int decode_subframe(FLACContext *s, int channel)
 
     if (wasted) {
         if (wasted+bps == 33) {
-            int i;
-            for (i = 0; i < s->blocksize; i++)
-                s->decoded_33bps[i] = (uint64_t)decoded[i] << wasted;
+            s->dsp.wasted33(s->decoded_33bps, decoded, wasted, s->blocksize);
         } else if (wasted < 32) {
-            int i;
-            for (i = 0; i < s->blocksize; i++)
-                decoded[i] = (unsigned)decoded[i] << wasted;
+            s->dsp.wasted32(decoded, wasted, s->blocksize);
         }
     }
 
diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c
index 71b4ac44aa..610831348a 100644
--- a/libavcodec/flacdsp.c
+++ b/libavcodec/flacdsp.c
@@ -84,11 +84,31 @@ static void flac_lpc_32_c(int32_t *decoded, const int coeffs[32],
 
 }
 
+static void flac_wasted_32_c(int32_t *decoded, int wasted, int len)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        decoded[i] = (unsigned)decoded[i] << wasted;
+}
+
+static void flac_wasted_33_c(int64_t *decoded, const int32_t *residual,
+                             int wasted, int len)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        decoded[i] = (uint64_t)residual[i] << wasted;
+}
+
 av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int channels)
 {
     c->lpc16        = flac_lpc_16_c;
     c->lpc32        = flac_lpc_32_c;
 
+    c->wasted32     = flac_wasted_32_c;
+    c->wasted33     = flac_wasted_33_c;
+
     switch (fmt) {
     case AV_SAMPLE_FMT_S32:
         c->decorrelate[0] = flac_decorrelate_indep_c_32;
diff --git a/libavcodec/flacdsp.h b/libavcodec/flacdsp.h
index 15149c026e..5a59c0c864 100644
--- a/libavcodec/flacdsp.h
+++ b/libavcodec/flacdsp.h
@@ -30,6 +30,9 @@ typedef struct FLACDSPContext {
                   int qlevel, int len);
     void (*lpc32)(int32_t *samples, const int coeffs[32], int order,
                   int qlevel, int len);
+    void (*wasted32)(int32_t *decoded, int wasted, int len);
+    void (*wasted33)(int64_t *decoded, const int32_t *residual,
+                     int wasted, int len);
     void (*lpc16_encode)(int32_t *res, const int32_t *smp, int len, int order,
                          const int32_t coefs[32], int shift);
     void (*lpc32_encode)(int32_t *res, const int32_t *smp, int len, int order,
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 5/8] checkasm/flacdsp: add a test for wasted32
  2024-05-11 19:46 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: run lpc benchmarks with an unmodified buffer James Almer
                   ` (2 preceding siblings ...)
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 4/8] avcodec/flacdsp: split off wasted bit handling into dsp functions James Almer
@ 2024-05-12 16:06 ` James Almer
  2024-05-12 16:38   ` Rémi Denis-Courmont
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 6/8] x86/flacdsp: add a SSE2 version of wasted32 James Almer
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 18+ messages in thread
From: James Almer @ 2024-05-12 16:06 UTC (permalink / raw)
  To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
 tests/checkasm/flacdsp.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
index d694c1093b..343bee41ed 100644
--- a/tests/checkasm/flacdsp.c
+++ b/tests/checkasm/flacdsp.c
@@ -83,6 +83,27 @@ static void check_lpc(int pred_order, int bps)
     bench_new(dst, coeffs, pred_order, qlevel, BUF_SIZE);
 }
 
+static void check_wasted32(void)
+{
+    int wasted = rnd() % 32;
+    LOCAL_ALIGNED_16(int32_t, dst,  [BUF_SIZE]);
+    LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]);
+
+    declare_func(void, int32_t *, int, int);
+
+    for (int i = 0; i < BUF_SIZE; i++)
+        dst[i] = rnd();
+
+    memcpy(dst0, dst, BUF_SIZE * sizeof (int32_t));
+    memcpy(dst1, dst, BUF_SIZE * sizeof (int32_t));
+    call_ref(dst0, wasted, BUF_SIZE);
+    call_new(dst1, wasted, BUF_SIZE);
+    if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0)
+       fail();
+    bench_new(dst, wasted, BUF_SIZE);
+}
+
 void checkasm_check_flacdsp(void)
 {
     LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]);
@@ -126,5 +147,8 @@ void checkasm_check_flacdsp(void)
         if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i]))
             check_lpc(pred_orders[i], 32);
 
+    if (check_func(h.wasted32, "flac_wasted_32"))
+        check_wasted32();
+
     report("lpc");
 }
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 6/8] x86/flacdsp: add a SSE2 version of wasted32
  2024-05-11 19:46 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: run lpc benchmarks with an unmodified buffer James Almer
                   ` (3 preceding siblings ...)
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 5/8] checkasm/flacdsp: add a test for wasted32 James Almer
@ 2024-05-12 16:06 ` James Almer
  2024-05-12 18:51   ` [FFmpeg-devel] [PATCH 6/8 v2] " James Almer
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 7/8] checkasm/flacdsp: add a test for wasted33 James Almer
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 18+ messages in thread
From: James Almer @ 2024-05-12 16:06 UTC (permalink / raw)
  To: ffmpeg-devel
flac_wasted_32_c: 851.3
flac_wasted_32_sse2: 53.3
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/flacdsp.asm    | 15 +++++++++++++++
 libavcodec/x86/flacdsp_init.c |  3 +++
 2 files changed, 18 insertions(+)
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
index f38eb7db76..3a940059c7 100644
--- a/libavcodec/x86/flacdsp.asm
+++ b/libavcodec/x86/flacdsp.asm
@@ -89,6 +89,21 @@ LPC_32 sse4, 32, psrlq
 LPC_32 xop,  32, psrlq
 %endif
 
+INIT_XMM sse2
+cglobal flac_wasted_32, 3,3,2, decoded, wasted, len
+    shl   lend, 2
+    lea   decodedq, [decodedq+lenq]
+    neg   lenq
+    movd  m1, wastedd
+ALIGN 16
+.loop:
+    mova  m0, [decodedq+lenq]
+    pslld m0, m1
+    mova  [decodedq+lenq], m0
+    add lenq, mmsize
+    jl .loop
+    RET
+
 ;----------------------------------------------------------------------------------
 ;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
 ;                                                   int len, int shift);
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index dee4bf88fc..67aa118760 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -30,6 +30,8 @@ void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
 void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
                         int qlevel, int len);
 
+void ff_flac_wasted_32_sse2(int32_t *decoded, int wasted, int len);
+
 #define DECORRELATE_FUNCS(fmt, opt)                                                      \
 void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
                                           int len, int shift);                           \
@@ -63,6 +65,7 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_SSE2(cpu_flags)) {
+        c->wasted32 = ff_flac_wasted_32_sse2;
         if (fmt == AV_SAMPLE_FMT_S16) {
             c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
             c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 7/8] checkasm/flacdsp: add a test for wasted33
  2024-05-11 19:46 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: run lpc benchmarks with an unmodified buffer James Almer
                   ` (4 preceding siblings ...)
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 6/8] x86/flacdsp: add a SSE2 version of wasted32 James Almer
@ 2024-05-12 16:06 ` James Almer
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 8/8] x86/flacdsp: add SSE4 and AVX2 versions of wasted33 James Almer
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 18+ messages in thread
From: James Almer @ 2024-05-12 16:06 UTC (permalink / raw)
  To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
 tests/checkasm/flacdsp.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
index 343bee41ed..6abdd255a3 100644
--- a/tests/checkasm/flacdsp.c
+++ b/tests/checkasm/flacdsp.c
@@ -104,6 +104,25 @@ static void check_wasted32(void)
     bench_new(dst, wasted, BUF_SIZE);
 }
 
+static void check_wasted33(void)
+{
+    int wasted = rnd() % 33;
+    LOCAL_ALIGNED_16(int32_t, residuals, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(int64_t, dst0, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(int64_t, dst1, [BUF_SIZE]);
+
+    declare_func(void, int64_t *, const int32_t *, int, int);
+
+    for (int i = 0; i < BUF_SIZE; i++)
+        residuals[i] = rnd();
+
+    call_ref(dst0, residuals, wasted, BUF_SIZE);
+    call_new(dst1, residuals, wasted, BUF_SIZE);
+    if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int64_t)) != 0)
+       fail();
+    bench_new(dst0, residuals, wasted, BUF_SIZE);
+}
+
 void checkasm_check_flacdsp(void)
 {
     LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]);
@@ -149,6 +168,8 @@ void checkasm_check_flacdsp(void)
 
     if (check_func(h.wasted32, "flac_wasted_32"))
         check_wasted32();
+    if (check_func(h.wasted33, "flac_wasted_33"))
+        check_wasted33();
 
     report("lpc");
 }
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 8/8] x86/flacdsp: add SSE4 and AVX2 versions of wasted33
  2024-05-11 19:46 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: run lpc benchmarks with an unmodified buffer James Almer
                   ` (5 preceding siblings ...)
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 7/8] checkasm/flacdsp: add a test for wasted33 James Almer
@ 2024-05-12 16:06 ` James Almer
  2024-05-12 18:53   ` [FFmpeg-devel] [PATCH 8/8 v2] x86/flacdsp: add an SSE4 version " James Almer
  2024-05-12 20:36 ` [FFmpeg-devel] [PATCH 09/10] avcodec/flacdsp: split off lpc33 into a dsp function James Almer
  2024-05-12 20:36 ` [FFmpeg-devel] [PATCH 10/10] checkasm/flacdsp: add a test for lpc33 James Almer
  8 siblings, 1 reply; 18+ messages in thread
From: James Almer @ 2024-05-12 16:06 UTC (permalink / raw)
  To: ffmpeg-devel
flac_wasted_33_c: 214.1
flac_wasted_33_sse4: 133.6
flac_wasted_33_avx2: 93.1
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/flacdsp.asm    | 24 ++++++++++++++++++++++++
 libavcodec/x86/flacdsp_init.c |  6 ++++++
 2 files changed, 30 insertions(+)
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
index 3a940059c7..84cd4dd465 100644
--- a/libavcodec/x86/flacdsp.asm
+++ b/libavcodec/x86/flacdsp.asm
@@ -104,6 +104,30 @@ ALIGN 16
     jl .loop
     RET
 
+%macro WASTED_33 1
+cglobal flac_wasted_33, 4,4,2, decoded, residuals, wasted, len
+    shl   lend, 2
+    lea   decodedq, [decodedq+lenq*2]
+    add   residualsq, lenq
+    neg   lenq
+    movd  xm1, wastedd
+ALIGN 16
+.loop:
+    pmovsxdq  m0, [residualsq+lenq]
+    psllq m0, xm1
+    mov%1  [decodedq+lenq*2], m0
+    add lenq, mmsize / 2
+    jl .loop
+    RET
+%endmacro
+
+INIT_XMM sse4
+WASTED_33 a
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+WASTED_33 u
+%endif
+
 ;----------------------------------------------------------------------------------
 ;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
 ;                                                   int len, int shift);
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index 67aa118760..22482f8787 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -31,6 +31,8 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
                         int qlevel, int len);
 
 void ff_flac_wasted_32_sse2(int32_t *decoded, int wasted, int len);
+void ff_flac_wasted_33_sse4(int64_t *decoded, const int32_t *residual, int wasted, int len);
+void ff_flac_wasted_33_avx2(int64_t *decoded, const int32_t *residual, int wasted, int len);
 
 #define DECORRELATE_FUNCS(fmt, opt)                                                      \
 void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
@@ -100,6 +102,7 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
     if (EXTERNAL_SSE4(cpu_flags)) {
         c->lpc16 = ff_flac_lpc_16_sse4;
         c->lpc32 = ff_flac_lpc_32_sse4;
+        c->wasted33 = ff_flac_wasted_33_sse4;
     }
     if (EXTERNAL_AVX(cpu_flags)) {
         if (fmt == AV_SAMPLE_FMT_S16) {
@@ -117,5 +120,8 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
     if (EXTERNAL_XOP(cpu_flags)) {
         c->lpc32 = ff_flac_lpc_32_xop;
     }
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->wasted33 = ff_flac_wasted_33_avx2;
+    }
 #endif /* HAVE_X86ASM */
 }
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH 4/8] avcodec/flacdsp: split off wasted bit handling into dsp functions
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 4/8] avcodec/flacdsp: split off wasted bit handling into dsp functions James Almer
@ 2024-05-12 16:15   ` Andreas Rheinhardt
  2024-05-12 16:44     ` James Almer
  0 siblings, 1 reply; 18+ messages in thread
From: Andreas Rheinhardt @ 2024-05-12 16:15 UTC (permalink / raw)
  To: ffmpeg-devel
James Almer:
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libavcodec/flacdec.c |  8 ++------
>  libavcodec/flacdsp.c | 20 ++++++++++++++++++++
>  libavcodec/flacdsp.h |  3 +++
>  3 files changed, 25 insertions(+), 6 deletions(-)
> 
> diff --git a/libavcodec/flacdec.c b/libavcodec/flacdec.c
> index 91bbdc657d..6e6a2896b4 100644
> --- a/libavcodec/flacdec.c
> +++ b/libavcodec/flacdec.c
> @@ -602,13 +602,9 @@ static inline int decode_subframe(FLACContext *s, int channel)
>  
>      if (wasted) {
>          if (wasted+bps == 33) {
> -            int i;
> -            for (i = 0; i < s->blocksize; i++)
> -                s->decoded_33bps[i] = (uint64_t)decoded[i] << wasted;
> +            s->dsp.wasted33(s->decoded_33bps, decoded, wasted, s->blocksize);
>          } else if (wasted < 32) {
> -            int i;
> -            for (i = 0; i < s->blocksize; i++)
> -                decoded[i] = (unsigned)decoded[i] << wasted;
> +            s->dsp.wasted32(decoded, wasted, s->blocksize);
>          }
>      }
>  
> diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c
> index 71b4ac44aa..610831348a 100644
> --- a/libavcodec/flacdsp.c
> +++ b/libavcodec/flacdsp.c
> @@ -84,11 +84,31 @@ static void flac_lpc_32_c(int32_t *decoded, const int coeffs[32],
>  
>  }
>  
> +static void flac_wasted_32_c(int32_t *decoded, int wasted, int len)
> +{
> +    int i;
> +
> +    for (i = 0; i < len; i++)
> +        decoded[i] = (unsigned)decoded[i] << wasted;
> +}
> +
> +static void flac_wasted_33_c(int64_t *decoded, const int32_t *residual,
> +                             int wasted, int len)
> +{
> +    int i;
> +
> +    for (i = 0; i < len; i++)
Nit: use "for (int i = 0; i < len; i++)" and drop the separate "int i;" declaration.
> +        decoded[i] = (uint64_t)residual[i] << wasted;
> +}
> +
>  av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int channels)
>  {
>      c->lpc16        = flac_lpc_16_c;
>      c->lpc32        = flac_lpc_32_c;
>  
> +    c->wasted32     = flac_wasted_32_c;
> +    c->wasted33     = flac_wasted_33_c;
> +
>      switch (fmt) {
>      case AV_SAMPLE_FMT_S32:
>          c->decorrelate[0] = flac_decorrelate_indep_c_32;
> diff --git a/libavcodec/flacdsp.h b/libavcodec/flacdsp.h
> index 15149c026e..5a59c0c864 100644
> --- a/libavcodec/flacdsp.h
> +++ b/libavcodec/flacdsp.h
> @@ -30,6 +30,9 @@ typedef struct FLACDSPContext {
>                    int qlevel, int len);
>      void (*lpc32)(int32_t *samples, const int coeffs[32], int order,
>                    int qlevel, int len);
> +    void (*wasted32)(int32_t *decoded, int wasted, int len);
> +    void (*wasted33)(int64_t *decoded, const int32_t *residual,
> +                     int wasted, int len);
>      void (*lpc16_encode)(int32_t *res, const int32_t *smp, int len, int order,
>                           const int32_t coefs[32], int shift);
>      void (*lpc32_encode)(int32_t *res, const int32_t *smp, int len, int order,
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/8] checkasm/flacdsp: add a test for wasted32
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 5/8] checkasm/flacdsp: add a test for wasted32 James Almer
@ 2024-05-12 16:38   ` Rémi Denis-Courmont
  2024-05-12 16:42     ` [FFmpeg-devel] [PATCH 5/8 v2] " James Almer
  0 siblings, 1 reply; 18+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-12 16:38 UTC (permalink / raw)
  To: ffmpeg-devel
On Sunday, 12 May 2024, 19:06:54 EEST, James Almer wrote:
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  tests/checkasm/flacdsp.c | 24 ++++++++++++++++++++++++
>  1 file changed, 24 insertions(+)
> 
> diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
> index d694c1093b..343bee41ed 100644
> --- a/tests/checkasm/flacdsp.c
> +++ b/tests/checkasm/flacdsp.c
> @@ -83,6 +83,27 @@ static void check_lpc(int pred_order, int bps)
>      bench_new(dst, coeffs, pred_order, qlevel, BUF_SIZE);
>  }
> 
> +static void check_wasted32(void)
> +{
> +    int wasted = rnd() % 32;
> +    LOCAL_ALIGNED_16(int32_t, dst,  [BUF_SIZE]);
> +    LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]);
> +    LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]);
> +
> +    declare_func(void, int32_t *, int, int);
> +
> +    for (int i = 0; i < BUF_SIZE; i++)
> +        dst[i] = rnd();
> +
> +    memcpy(dst0, dst, BUF_SIZE * sizeof (int32_t));
> +    memcpy(dst1, dst, BUF_SIZE * sizeof (int32_t));
> +    call_ref(dst0, wasted, BUF_SIZE);
> +    call_new(dst1, wasted, BUF_SIZE);
> +    if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0)
> +       fail();
> +    bench_new(dst, wasted, BUF_SIZE);
> +}
> +
>  void checkasm_check_flacdsp(void)
>  {
>      LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]);
> @@ -126,5 +147,8 @@ void checkasm_check_flacdsp(void)
>          if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i]))
>              check_lpc(pred_orders[i], 32);
> 
> +    if (check_func(h.wasted32, "flac_wasted_32"))
> +        check_wasted32();
> +
>      report("lpc");
Missing a separate report() call for the new wasted32 test?
>  }
-- 
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 5/8 v2] checkasm/flacdsp: add a test for wasted32
  2024-05-12 16:38   ` Rémi Denis-Courmont
@ 2024-05-12 16:42     ` James Almer
  0 siblings, 0 replies; 18+ messages in thread
From: James Almer @ 2024-05-12 16:42 UTC (permalink / raw)
  To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
 tests/checkasm/flacdsp.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
index d694c1093b..ee0897ed01 100644
--- a/tests/checkasm/flacdsp.c
+++ b/tests/checkasm/flacdsp.c
@@ -83,6 +83,27 @@ static void check_lpc(int pred_order, int bps)
     bench_new(dst, coeffs, pred_order, qlevel, BUF_SIZE);
 }
 
+static void check_wasted32(void)
+{
+    int wasted = rnd() % 32;
+    LOCAL_ALIGNED_16(int32_t, dst,  [BUF_SIZE]);
+    LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]);
+
+    declare_func(void, int32_t *, int, int);
+
+    for (int i = 0; i < BUF_SIZE; i++)
+        dst[i] = rnd();
+
+    memcpy(dst0, dst, BUF_SIZE * sizeof (int32_t));
+    memcpy(dst1, dst, BUF_SIZE * sizeof (int32_t));
+    call_ref(dst0, wasted, BUF_SIZE);
+    call_new(dst1, wasted, BUF_SIZE);
+    if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0)
+       fail();
+    bench_new(dst, wasted, BUF_SIZE);
+}
+
 void checkasm_check_flacdsp(void)
 {
     LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]);
@@ -127,4 +148,9 @@ void checkasm_check_flacdsp(void)
             check_lpc(pred_orders[i], 32);
 
     report("lpc");
+
+    if (check_func(h.wasted32, "flac_wasted_32"))
+        check_wasted32();
+
+    report("wasted");
 }
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH 4/8] avcodec/flacdsp: split off wasted bit handling into dsp functions
  2024-05-12 16:15   ` Andreas Rheinhardt
@ 2024-05-12 16:44     ` James Almer
  0 siblings, 0 replies; 18+ messages in thread
From: James Almer @ 2024-05-12 16:44 UTC (permalink / raw)
  To: ffmpeg-devel
On 5/12/2024 1:15 PM, Andreas Rheinhardt wrote:
> James Almer:
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>>   libavcodec/flacdec.c |  8 ++------
>>   libavcodec/flacdsp.c | 20 ++++++++++++++++++++
>>   libavcodec/flacdsp.h |  3 +++
>>   3 files changed, 25 insertions(+), 6 deletions(-)
>>
>> diff --git a/libavcodec/flacdec.c b/libavcodec/flacdec.c
>> index 91bbdc657d..6e6a2896b4 100644
>> --- a/libavcodec/flacdec.c
>> +++ b/libavcodec/flacdec.c
>> @@ -602,13 +602,9 @@ static inline int decode_subframe(FLACContext *s, int channel)
>>   
>>       if (wasted) {
>>           if (wasted+bps == 33) {
>> -            int i;
>> -            for (i = 0; i < s->blocksize; i++)
>> -                s->decoded_33bps[i] = (uint64_t)decoded[i] << wasted;
>> +            s->dsp.wasted33(s->decoded_33bps, decoded, wasted, s->blocksize);
>>           } else if (wasted < 32) {
>> -            int i;
>> -            for (i = 0; i < s->blocksize; i++)
>> -                decoded[i] = (unsigned)decoded[i] << wasted;
>> +            s->dsp.wasted32(decoded, wasted, s->blocksize);
>>           }
>>       }
>>   
>> diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c
>> index 71b4ac44aa..610831348a 100644
>> --- a/libavcodec/flacdsp.c
>> +++ b/libavcodec/flacdsp.c
>> @@ -84,11 +84,31 @@ static void flac_lpc_32_c(int32_t *decoded, const int coeffs[32],
>>   
>>   }
>>   
>> +static void flac_wasted_32_c(int32_t *decoded, int wasted, int len)
>> +{
>> +    int i;
>> +
>> +    for (i = 0; i < len; i++)
>> +        decoded[i] = (unsigned)decoded[i] << wasted;
>> +}
>> +
>> +static void flac_wasted_33_c(int64_t *decoded, const int32_t *residual,
>> +                             int wasted, int len)
>> +{
>> +    int i;
>> +
>> +    for (i = 0; i < len; i++)
> 
> for (int i = 0
Fixed locally.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 6/8 v2] x86/flacdsp: add a SSE2 version of wasted32
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 6/8] x86/flacdsp: add a SSE2 version of wasted32 James Almer
@ 2024-05-12 18:51   ` James Almer
  2024-05-12 20:22     ` Lynne via ffmpeg-devel
  0 siblings, 1 reply; 18+ messages in thread
From: James Almer @ 2024-05-12 18:51 UTC (permalink / raw)
  To: ffmpeg-devel
flac_wasted_32_c: 851.3
flac_wasted_32_sse2: 41.3
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/flacdsp.asm    | 24 ++++++++++++++++++++++++
 libavcodec/x86/flacdsp_init.c |  3 +++
 2 files changed, 27 insertions(+)
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
index f38eb7db76..21b2439bc0 100644
--- a/libavcodec/x86/flacdsp.asm
+++ b/libavcodec/x86/flacdsp.asm
@@ -89,6 +89,30 @@ LPC_32 sse4, 32, psrlq
 LPC_32 xop,  32, psrlq
 %endif
 
+INIT_XMM sse2
+cglobal flac_wasted_32, 3,3,5, decoded, wasted, len
+    shl   lend, 2
+    add   decodedq, lenq
+    neg   lenq
+    movd  m4, wastedd
+ALIGN 16
+.loop:
+    mova  m0, [decodedq+lenq+mmsize*0]
+    mova  m1, [decodedq+lenq+mmsize*1]
+    mova  m2, [decodedq+lenq+mmsize*2]
+    mova  m3, [decodedq+lenq+mmsize*3]
+    pslld m0, m4
+    pslld m1, m4
+    pslld m2, m4
+    pslld m3, m4
+    mova  [decodedq+lenq+mmsize*0], m0
+    mova  [decodedq+lenq+mmsize*1], m1
+    mova  [decodedq+lenq+mmsize*2], m2
+    mova  [decodedq+lenq+mmsize*3], m3
+    add lenq, mmsize * 4
+    jl .loop
+    RET
+
 ;----------------------------------------------------------------------------------
 ;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
 ;                                                   int len, int shift);
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index dee4bf88fc..67aa118760 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -30,6 +30,8 @@ void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
 void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
                         int qlevel, int len);
 
+void ff_flac_wasted_32_sse2(int32_t *decoded, int wasted, int len);
+
 #define DECORRELATE_FUNCS(fmt, opt)                                                      \
 void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
                                           int len, int shift);                           \
@@ -63,6 +65,7 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_SSE2(cpu_flags)) {
+        c->wasted32 = ff_flac_wasted_32_sse2;
         if (fmt == AV_SAMPLE_FMT_S16) {
             c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
             c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 8/8 v2] x86/flacdsp: add an SSE4 version of wasted33
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 8/8] x86/flacdsp: add SSE4 and AVX2 versions of wasted33 James Almer
@ 2024-05-12 18:53   ` James Almer
  0 siblings, 0 replies; 18+ messages in thread
From: James Almer @ 2024-05-12 18:53 UTC (permalink / raw)
  To: ffmpeg-devel
flac_wasted_33_c: 214.1
flac_wasted_33_sse4: 103.2
Signed-off-by: James Almer <jamrial@gmail.com>
---
Removed the AVX2 version, as the lane crossing in pmovsxdq negated pretty much all
of the speedup gained from processing twice the amount of data.
 libavcodec/x86/flacdsp.asm    | 25 +++++++++++++++++++++++++
 libavcodec/x86/flacdsp_init.c |  2 ++
 2 files changed, 27 insertions(+)
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
index 21b2439bc0..15fcec4f08 100644
--- a/libavcodec/x86/flacdsp.asm
+++ b/libavcodec/x86/flacdsp.asm
@@ -113,6 +113,31 @@ ALIGN 16
     jl .loop
     RET
 
+INIT_XMM sse4
+cglobal flac_wasted_33, 4,4,5, decoded, residuals, wasted, len
+    shl   lend, 2
+    lea   decodedq, [decodedq+lenq*2]
+    add   residualsq, lenq
+    neg   lenq
+    movd  m4, wastedd
+ALIGN 16
+.loop:
+    pmovsxdq  m0, [residualsq+lenq+mmsize*0]
+    pmovsxdq  m1, [residualsq+lenq+mmsize/2]
+    pmovsxdq  m2, [residualsq+lenq+mmsize*1]
+    pmovsxdq  m3, [residualsq+lenq+mmsize*1+mmsize/2]
+    psllq m0, m4
+    psllq m1, m4
+    psllq m2, m4
+    psllq m3, m4
+    mova  [decodedq+lenq*2+mmsize*0], m0
+    mova  [decodedq+lenq*2+mmsize*1], m1
+    mova  [decodedq+lenq*2+mmsize*2], m2
+    mova  [decodedq+lenq*2+mmsize*3], m3
+    add lenq, mmsize * 2
+    jl .loop
+    RET
+
 ;----------------------------------------------------------------------------------
 ;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
 ;                                                   int len, int shift);
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index 67aa118760..fa993d3466 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -31,6 +31,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
                         int qlevel, int len);
 
 void ff_flac_wasted_32_sse2(int32_t *decoded, int wasted, int len);
+void ff_flac_wasted_33_sse4(int64_t *decoded, const int32_t *residual, int wasted, int len);
 
 #define DECORRELATE_FUNCS(fmt, opt)                                                      \
 void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
@@ -100,6 +101,7 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
     if (EXTERNAL_SSE4(cpu_flags)) {
         c->lpc16 = ff_flac_lpc_16_sse4;
         c->lpc32 = ff_flac_lpc_32_sse4;
+        c->wasted33 = ff_flac_wasted_33_sse4;
     }
     if (EXTERNAL_AVX(cpu_flags)) {
         if (fmt == AV_SAMPLE_FMT_S16) {
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [FFmpeg-devel] [PATCH 6/8 v2] x86/flacdsp: add a SSE2 version of wasted32
  2024-05-12 18:51   ` [FFmpeg-devel] [PATCH 6/8 v2] " James Almer
@ 2024-05-12 20:22     ` Lynne via ffmpeg-devel
  0 siblings, 0 replies; 18+ messages in thread
From: Lynne via ffmpeg-devel @ 2024-05-12 20:22 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne
On 12/05/2024 20:51, James Almer wrote:
> flac_wasted_32_c: 851.3
> flac_wasted_32_sse2: 41.3
> 
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>   libavcodec/x86/flacdsp.asm    | 24 ++++++++++++++++++++++++
>   libavcodec/x86/flacdsp_init.c |  3 +++
>   2 files changed, 27 insertions(+)
> 
> diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
> index f38eb7db76..21b2439bc0 100644
> --- a/libavcodec/x86/flacdsp.asm
> +++ b/libavcodec/x86/flacdsp.asm
> @@ -89,6 +89,30 @@ LPC_32 sse4, 32, psrlq
>   LPC_32 xop,  32, psrlq
>   %endif
>   
> +INIT_XMM sse2
> +cglobal flac_wasted_32, 3,3,5, decoded, wasted, len
> +    shl   lend, 2
> +    add   decodedq, lenq
> +    neg   lenq
> +    movd  m4, wastedd
> +ALIGN 16
> +.loop:
> +    mova  m0, [decodedq+lenq+mmsize*0]
> +    mova  m1, [decodedq+lenq+mmsize*1]
> +    mova  m2, [decodedq+lenq+mmsize*2]
> +    mova  m3, [decodedq+lenq+mmsize*3]
> +    pslld m0, m4
> +    pslld m1, m4
> +    pslld m2, m4
> +    pslld m3, m4
> +    mova  [decodedq+lenq+mmsize*0], m0
> +    mova  [decodedq+lenq+mmsize*1], m1
> +    mova  [decodedq+lenq+mmsize*2], m2
> +    mova  [decodedq+lenq+mmsize*3], m3
> +    add lenq, mmsize * 4
> +    jl .loop
> +    RET
Looks good
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 09/10] avcodec/flacdsp: split off lpc33 into a dsp function
  2024-05-11 19:46 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: run lpc benchmarks with an unmodified buffer James Almer
                   ` (6 preceding siblings ...)
  2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 8/8] x86/flacdsp: add SSE4 and AVX2 versions of wasted33 James Almer
@ 2024-05-12 20:36 ` James Almer
  2024-05-12 20:36 ` [FFmpeg-devel] [PATCH 10/10] checkasm/flacdsp: add a test for lpc33 James Almer
  8 siblings, 0 replies; 18+ messages in thread
From: James Almer @ 2024-05-12 20:36 UTC (permalink / raw)
  To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/flacdec.c |  7 +------
 libavcodec/flacdsp.c | 15 +++++++++++++++
 libavcodec/flacdsp.h |  2 ++
 3 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/libavcodec/flacdec.c b/libavcodec/flacdec.c
index 6e6a2896b4..460d3bc01e 100644
--- a/libavcodec/flacdec.c
+++ b/libavcodec/flacdec.c
@@ -509,12 +509,7 @@ static int decode_subframe_lpc_33bps(FLACContext *s, int64_t *decoded,
     if ((ret = decode_residuals(s, residual, pred_order)) < 0)
         return ret;
 
-    for (i = pred_order; i < s->blocksize; i++, decoded++) {
-        int64_t sum = 0;
-        for (j = 0; j < pred_order; j++)
-            sum += (int64_t)coeffs[j] * (uint64_t)decoded[j];
-        decoded[j] = residual[i] + (sum >> qlevel);
-    }
+    s->dsp.lpc33(decoded, residual, coeffs, pred_order, qlevel, s->blocksize);
 
     return 0;
 }
diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c
index 27d3e923ee..f5362bf66f 100644
--- a/libavcodec/flacdsp.c
+++ b/libavcodec/flacdsp.c
@@ -84,6 +84,20 @@ static void flac_lpc_32_c(int32_t *decoded, const int coeffs[32],
 
 }
 
+static void flac_lpc_33_c(int64_t *decoded, const int32_t *residual,
+                          const int coeffs[32], int pred_order,
+                          int qlevel, int len)
+{
+    int i, j;
+
+    for (i = pred_order; i < len; i++, decoded++) {
+        int64_t sum = 0;
+        for (j = 0; j < pred_order; j++)
+            sum += (int64_t)coeffs[j] * (uint64_t)decoded[j];
+        decoded[j] = residual[i] + (sum >> qlevel);
+    }
+}
+
 static void flac_wasted_32_c(int32_t *decoded, int wasted, int len)
 {
     for (int i = 0; i < len; i++)
@@ -101,6 +115,7 @@ av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int cha
 {
     c->lpc16        = flac_lpc_16_c;
     c->lpc32        = flac_lpc_32_c;
+    c->lpc33        = flac_lpc_33_c;
 
     c->wasted32     = flac_wasted_32_c;
     c->wasted33     = flac_wasted_33_c;
diff --git a/libavcodec/flacdsp.h b/libavcodec/flacdsp.h
index 5a59c0c864..3b7b35a112 100644
--- a/libavcodec/flacdsp.h
+++ b/libavcodec/flacdsp.h
@@ -30,6 +30,8 @@ typedef struct FLACDSPContext {
                   int qlevel, int len);
     void (*lpc32)(int32_t *samples, const int coeffs[32], int order,
                   int qlevel, int len);
+    void (*lpc33)(int64_t *samples, const int32_t *residual, const int coeffs[32],
+                  int pred_order, int qlevel, int len);
     void (*wasted32)(int32_t *decoded, int wasted, int len);
     void (*wasted33)(int64_t *decoded, const int32_t *residual,
                      int wasted, int len);
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [FFmpeg-devel] [PATCH 10/10] checkasm/flacdsp: add a test for lpc33
  2024-05-11 19:46 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: run lpc benchmarks with an unmodified buffer James Almer
                   ` (7 preceding siblings ...)
  2024-05-12 20:36 ` [FFmpeg-devel] [PATCH 09/10] avcodec/flacdsp: split off lpc33 into a dsp function James Almer
@ 2024-05-12 20:36 ` James Almer
  8 siblings, 0 replies; 18+ messages in thread
From: James Almer @ 2024-05-12 20:36 UTC (permalink / raw)
  To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
 tests/checkasm/flacdsp.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
index 96b7d05f7e..6f8e8817b5 100644
--- a/tests/checkasm/flacdsp.c
+++ b/tests/checkasm/flacdsp.c
@@ -83,6 +83,35 @@ static void check_lpc(int pred_order, int bps)
     bench_new(dst, coeffs, pred_order, qlevel, BUF_SIZE);
 }
 
+static void check_lpc33(int pred_order)
+{
+    int qlevel = rnd() % 16;
+    int coeff_prec = (rnd() % 15) + 1;
+    LOCAL_ALIGNED_16(int64_t, dst,  [BUF_SIZE]);
+    LOCAL_ALIGNED_16(int64_t, dst0, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(int64_t, dst1, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(int32_t, residuals, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(int32_t, coeffs, [32]);
+
+    declare_func(void, int64_t *, const int32_t *, const int[32], int, int, int);
+
+    for (int i = 0; i < 32; i++)
+        coeffs[i] = sign_extend(rnd(), coeff_prec);
+
+    for (int i = 0; i < BUF_SIZE; i++) {
+        residuals[i] = sign_extend(rnd(), pred_order);
+        dst[i] = sign_extend64(((int64_t)rnd() << 1) | (rnd() & 1), 33);
+    }
+
+    memcpy(dst0, dst, BUF_SIZE * sizeof (int64_t));
+    memcpy(dst1, dst, BUF_SIZE * sizeof (int64_t));
+    call_ref(dst0, residuals, coeffs, pred_order, qlevel, BUF_SIZE);
+    call_new(dst1, residuals, coeffs, pred_order, qlevel, BUF_SIZE);
+    if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int64_t)) != 0)
+       fail();
+    bench_new(dst, residuals, coeffs, pred_order, qlevel, BUF_SIZE);
+}
+
 static void check_wasted32(void)
 {
     int wasted = rnd() % 32;
@@ -165,6 +194,9 @@ void checkasm_check_flacdsp(void)
     for (i = 0; i < FF_ARRAY_ELEMS(pred_orders); i++)
         if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i]))
             check_lpc(pred_orders[i], 32);
+    for (i = 0; i < FF_ARRAY_ELEMS(pred_orders); i++)
+        if (check_func(h.lpc33, "flac_lpc_33_%d", pred_orders[i]))
+            check_lpc33(pred_orders[i]);
 
     report("lpc");
 
-- 
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 18+ messages in thread
end of thread, other threads:[~2024-05-12 20:36 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-11 19:46 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: run lpc benchmarks with an unmodified buffer James Almer
2024-05-11 19:46 ` [FFmpeg-devel] [PATCH 2/2] checkasm/flacdsp: sanitize lpc arguments James Almer
2024-05-11 20:31   ` [FFmpeg-devel] [PATCH 2/2 v2] " James Almer
2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 3/8] x86/flacdsp: add a SSE4 version of lpc16 James Almer
2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 4/8] avcodec/flacdsp: split off wasted bit handling into dsp functions James Almer
2024-05-12 16:15   ` Andreas Rheinhardt
2024-05-12 16:44     ` James Almer
2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 5/8] checkasm/flacdsp: add a test for wasted32 James Almer
2024-05-12 16:38   ` Rémi Denis-Courmont
2024-05-12 16:42     ` [FFmpeg-devel] [PATCH 5/8 v2] " James Almer
2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 6/8] x86/flacdsp: add a SSE2 version of wasted32 James Almer
2024-05-12 18:51   ` [FFmpeg-devel] [PATCH 6/8 v2] " James Almer
2024-05-12 20:22     ` Lynne via ffmpeg-devel
2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 7/8] checkasm/flacdsp: add a test for wasted33 James Almer
2024-05-12 16:06 ` [FFmpeg-devel] [PATCH 8/8] x86/flacdsp: add SSE4 and AVX2 versions of wasted33 James Almer
2024-05-12 18:53   ` [FFmpeg-devel] [PATCH 8/8 v2] x86/flacdsp: add an SSE4 version " James Almer
2024-05-12 20:36 ` [FFmpeg-devel] [PATCH 09/10] avcodec/flacdsp: split off lpc33 into a dsp function James Almer
2024-05-12 20:36 ` [FFmpeg-devel] [PATCH 10/10] checkasm/flacdsp: add a test for lpc33 James Almer
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git