* [FFmpeg-devel] [PATCH 2/3 v2] x86/takdsp: add avx2 versions of all functions
2023-12-22 12:12 [FFmpeg-devel] [PATCH 1/3 v2] checkasm/takdsp: add decorrelate_sf test James Almer
@ 2023-12-22 12:12 ` James Almer
2023-12-22 12:12 ` [FFmpeg-devel] [PATCH 3/3] avcodec/takdsp: fix const correctness James Almer
2023-12-22 12:16 ` [FFmpeg-devel] [PATCH 1/3 v2] checkasm/takdsp: add decorrelate_sf test Martin Storsjö
2 siblings, 0 replies; 5+ messages in thread
From: James Almer @ 2023-12-22 12:12 UTC (permalink / raw)
To: ffmpeg-devel
Benchmark results on an Intel Core i7-12700K:
decorrelate_ls_c: 814.3
decorrelate_ls_sse2: 165.8
decorrelate_ls_avx2: 101.3
decorrelate_sf_c: 1602.6
decorrelate_sf_sse4: 640.1
decorrelate_sf_avx2: 324.6
decorrelate_sm_c: 1564.8
decorrelate_sm_sse2: 379.3
decorrelate_sm_avx2: 203.3
decorrelate_sr_c: 785.3
decorrelate_sr_sse2: 176.3
decorrelate_sr_avx2: 99.8
Signed-off-by: James Almer <jamrial@gmail.com>
---
No changes since last version
libavcodec/x86/takdsp.asm | 36 ++++++++++++++++++++++--------------
libavcodec/x86/takdsp_init.c | 11 +++++++++++
2 files changed, 33 insertions(+), 14 deletions(-)
diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm
index be8e1ab553..a5501cc285 100644
--- a/libavcodec/x86/takdsp.asm
+++ b/libavcodec/x86/takdsp.asm
@@ -28,7 +28,7 @@ pd_128: times 4 dd 128
SECTION .text
-INIT_XMM sse2
+%macro TAK_DECORRELATE 0
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
shl lengthd, 2
add p1q, lengthq
@@ -73,10 +73,8 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
mova m1, [p2q+lengthq]
mova m3, [p1q+lengthq+mmsize]
mova m4, [p2q+lengthq+mmsize]
- mova m2, m1
- mova m5, m4
- psrad m2, 1
- psrad m5, 1
+ psrad m2, m1, 1
+ psrad m5, m4, 1
psubd m0, m2
psubd m3, m5
paddd m1, m0
@@ -88,29 +86,39 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
add lengthq, mmsize*2
jl .loop
RET
+%endmacro
-INIT_XMM sse4
+INIT_XMM sse2
+TAK_DECORRELATE
+INIT_YMM avx2
+TAK_DECORRELATE
+
+%macro TAK_DECORRELATE_SF 0
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
shl lengthd, 2
add p1q, lengthq
add p2q, lengthq
neg lengthq
- movd m2, dshiftm
- movd m3, dfactorm
- pshufd m3, m3, 0
- mova m4, [pd_128]
+ movd xm2, dshiftm
+ VPBROADCASTD m3, dfactorm
+ VBROADCASTI128 m4, [pd_128]
.loop:
- mova m0, [p1q+lengthq]
mova m1, [p2q+lengthq]
- psrad m1, m2
+ psrad m1, xm2
pmulld m1, m3
paddd m1, m4
psrad m1, 8
- pslld m1, m2
- psubd m1, m0
+ pslld m1, xm2
+ psubd m1, [p1q+lengthq]
mova [p1q+lengthq], m1
add lengthq, mmsize
jl .loop
RET
+%endmacro
+
+INIT_XMM sse4
+TAK_DECORRELATE_SF
+INIT_YMM avx2
+TAK_DECORRELATE_SF
diff --git a/libavcodec/x86/takdsp_init.c b/libavcodec/x86/takdsp_init.c
index b2e6e639ee..c99a057b24 100644
--- a/libavcodec/x86/takdsp_init.c
+++ b/libavcodec/x86/takdsp_init.c
@@ -24,9 +24,13 @@
#include "config.h"
void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_ls_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sr_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sm_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
+void ff_tak_decorrelate_sf_avx2(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
{
@@ -42,5 +46,12 @@ av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
if (EXTERNAL_SSE4(cpu_flags)) {
c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
}
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->decorrelate_ls = ff_tak_decorrelate_ls_avx2;
+ c->decorrelate_sr = ff_tak_decorrelate_sr_avx2;
+ c->decorrelate_sm = ff_tak_decorrelate_sm_avx2;
+ c->decorrelate_sf = ff_tak_decorrelate_sf_avx2;
+ }
#endif
}
--
2.43.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 3/3] avcodec/takdsp: fix const correctness
2023-12-22 12:12 [FFmpeg-devel] [PATCH 1/3 v2] checkasm/takdsp: add decorrelate_sf test James Almer
2023-12-22 12:12 ` [FFmpeg-devel] [PATCH 2/3 v2] x86/takdsp: add avx2 versions of all functions James Almer
@ 2023-12-22 12:12 ` James Almer
2023-12-22 12:17 ` Martin Storsjö
2023-12-22 12:16 ` [FFmpeg-devel] [PATCH 1/3 v2] checkasm/takdsp: add decorrelate_sf test Martin Storsjö
2 siblings, 1 reply; 5+ messages in thread
From: James Almer @ 2023-12-22 12:12 UTC (permalink / raw)
To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
libavcodec/riscv/takdsp_init.c | 4 ++--
libavcodec/takdsp.c | 6 +++---
libavcodec/takdsp.h | 6 +++---
libavcodec/x86/takdsp_init.c | 12 ++++++------
tests/checkasm/takdsp.c | 6 +++---
5 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/libavcodec/riscv/takdsp_init.c b/libavcodec/riscv/takdsp_init.c
index 0b4ec18086..2d5c974459 100644
--- a/libavcodec/riscv/takdsp_init.c
+++ b/libavcodec/riscv/takdsp_init.c
@@ -25,8 +25,8 @@
#include "libavutil/riscv/cpu.h"
#include "libavcodec/takdsp.h"
-void ff_decorrelate_ls_rvv(int32_t *p1, int32_t *p2, int length);
-void ff_decorrelate_sr_rvv(int32_t *p1, int32_t *p2, int length);
+void ff_decorrelate_ls_rvv(const int32_t *p1, int32_t *p2, int length);
+void ff_decorrelate_sr_rvv(int32_t *p1, const int32_t *p2, int length);
av_cold void ff_takdsp_init_riscv(TAKDSPContext *dsp)
{
diff --git a/libavcodec/takdsp.c b/libavcodec/takdsp.c
index 25cac558ce..51b6658de4 100644
--- a/libavcodec/takdsp.c
+++ b/libavcodec/takdsp.c
@@ -23,7 +23,7 @@
#include "takdsp.h"
#include "config.h"
-static void decorrelate_ls(int32_t *p1, int32_t *p2, int length)
+static void decorrelate_ls(const int32_t *p1, int32_t *p2, int length)
{
int i;
@@ -34,7 +34,7 @@ static void decorrelate_ls(int32_t *p1, int32_t *p2, int length)
}
}
-static void decorrelate_sr(int32_t *p1, int32_t *p2, int length)
+static void decorrelate_sr(int32_t *p1, const int32_t *p2, int length)
{
int i;
@@ -58,7 +58,7 @@ static void decorrelate_sm(int32_t *p1, int32_t *p2, int length)
}
}
-static void decorrelate_sf(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor)
+static void decorrelate_sf(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor)
{
int i;
diff --git a/libavcodec/takdsp.h b/libavcodec/takdsp.h
index 55f1a10cd3..13b5e530b2 100644
--- a/libavcodec/takdsp.h
+++ b/libavcodec/takdsp.h
@@ -22,10 +22,10 @@
#include <stdint.h>
typedef struct TAKDSPContext {
- void (*decorrelate_ls)(int32_t *p1, int32_t *p2, int length);
- void (*decorrelate_sr)(int32_t *p1, int32_t *p2, int length);
+ void (*decorrelate_ls)(const int32_t *p1, int32_t *p2, int length);
+ void (*decorrelate_sr)(int32_t *p1, const int32_t *p2, int length);
void (*decorrelate_sm)(int32_t *p1, int32_t *p2, int length);
- void (*decorrelate_sf)(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
+ void (*decorrelate_sf)(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor);
} TAKDSPContext;
void ff_takdsp_init(TAKDSPContext *c);
diff --git a/libavcodec/x86/takdsp_init.c b/libavcodec/x86/takdsp_init.c
index c99a057b24..9553f8442c 100644
--- a/libavcodec/x86/takdsp_init.c
+++ b/libavcodec/x86/takdsp_init.c
@@ -23,14 +23,14 @@
#include "libavutil/x86/cpu.h"
#include "config.h"
-void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length);
-void ff_tak_decorrelate_ls_avx2(int32_t *p1, int32_t *p2, int length);
-void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length);
-void ff_tak_decorrelate_sr_avx2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_ls_sse2(const int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_ls_avx2(const int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sr_sse2(int32_t *p1, const int32_t *p2, int length);
+void ff_tak_decorrelate_sr_avx2(int32_t *p1, const int32_t *p2, int length);
void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sm_avx2(int32_t *p1, int32_t *p2, int length);
-void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
-void ff_tak_decorrelate_sf_avx2(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
+void ff_tak_decorrelate_sf_sse4(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor);
+void ff_tak_decorrelate_sf_avx2(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor);
av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
{
diff --git a/tests/checkasm/takdsp.c b/tests/checkasm/takdsp.c
index 78528b1c5d..fd4122f34b 100644
--- a/tests/checkasm/takdsp.c
+++ b/tests/checkasm/takdsp.c
@@ -37,7 +37,7 @@
#define BUF_SIZE 1024
static void test_decorrelate_ls(TAKDSPContext *s) {
- declare_func(void, int32_t *, int32_t *, int);
+ declare_func(void, const int32_t *, int32_t *, int);
if (check_func(s->decorrelate_ls, "decorrelate_ls")) {
LOCAL_ALIGNED_32(int32_t, p1, [BUF_SIZE]);
@@ -62,7 +62,7 @@ static void test_decorrelate_ls(TAKDSPContext *s) {
}
static void test_decorrelate_sr(TAKDSPContext *s) {
- declare_func(void, int32_t *, int32_t *, int);
+ declare_func(void, int32_t *, const int32_t *, int);
if (check_func(s->decorrelate_sr, "decorrelate_sr")) {
LOCAL_ALIGNED_32(int32_t, p1, [BUF_SIZE]);
@@ -115,7 +115,7 @@ static void test_decorrelate_sm(TAKDSPContext *s) {
}
static void test_decorrelate_sf(TAKDSPContext *s) {
- declare_func(void, int32_t *, int32_t *, int, int, int);
+ declare_func(void, int32_t *, const int32_t *, int, int, int);
if (check_func(s->decorrelate_sf, "decorrelate_sf")) {
LOCAL_ALIGNED_32(int32_t, p1, [BUF_SIZE]);
--
2.43.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/3 v2] checkasm/takdsp: add decorrelate_sf test
2023-12-22 12:12 [FFmpeg-devel] [PATCH 1/3 v2] checkasm/takdsp: add decorrelate_sf test James Almer
2023-12-22 12:12 ` [FFmpeg-devel] [PATCH 2/3 v2] x86/takdsp: add avx2 versions of all functions James Almer
2023-12-22 12:12 ` [FFmpeg-devel] [PATCH 3/3] avcodec/takdsp: fix const correctness James Almer
@ 2023-12-22 12:16 ` Martin Storsjö
2 siblings, 0 replies; 5+ messages in thread
From: Martin Storsjö @ 2023-12-22 12:16 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Fri, 22 Dec 2023, James Almer wrote:
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
> Fixes broken logic as reported by Martin.
>
> tests/checkasm/takdsp.c | 35 ++++++++++++++++++++++++++++++++---
> 1 file changed, 32 insertions(+), 3 deletions(-)
> +static void test_decorrelate_sf(TAKDSPContext *s) {
> + declare_func(void, int32_t *, int32_t *, int, int, int);
> +
> + if (check_func(s->decorrelate_sf, "decorrelate_sf")) {
> + LOCAL_ALIGNED_32(int32_t, p1, [BUF_SIZE]);
> + LOCAL_ALIGNED_32(int32_t, p1_2, [BUF_SIZE]);
> + LOCAL_ALIGNED_32(int32_t, p2, [BUF_SIZE]);
> + int dshift, dfactor;
> +
> + randomize(p1, BUF_SIZE);
> + memcpy(p1_2, p1, BUF_SIZE * sizeof(*p1));
> + randomize(p2, BUF_SIZE);
> + dshift = (rnd() & 0xF) + 1;
> + dfactor = sign_extend(rnd(), 10);
> +
> + call_ref(p1, p2, BUF_SIZE, dshift, dfactor);
> + call_new(p1_2, p2, BUF_SIZE, dshift, dfactor);
> +
> + if (memcmp(p1, p1_2, BUF_SIZE) != 0) {
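This still needs a `* sizeof(*p1)`: memcmp() takes a byte count, so as written it only compares the first BUF_SIZE bytes (a quarter of the int32_t buffer).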
Other than that, this looks good, thanks!
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread