* [FFmpeg-devel] [PATCH 1/2] checkasm/takdsp: add decorrelate_sf test
@ 2023-12-22 1:15 James Almer
2023-12-22 1:15 ` [FFmpeg-devel] [PATCH 2/2] x86/takdsp: add avx2 versions of all functions James Almer
2023-12-22 9:52 ` [FFmpeg-devel] [PATCH 1/2] checkasm/takdsp: add decorrelate_sf test Martin Storsjö
0 siblings, 2 replies; 8+ messages in thread
From: James Almer @ 2023-12-22 1:15 UTC (permalink / raw)
To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
tests/checkasm/takdsp.c | 36 +++++++++++++++++++++++++++++++++---
1 file changed, 33 insertions(+), 3 deletions(-)
diff --git a/tests/checkasm/takdsp.c b/tests/checkasm/takdsp.c
index 495b7242c5..8df93cfd52 100644
--- a/tests/checkasm/takdsp.c
+++ b/tests/checkasm/takdsp.c
@@ -24,6 +24,7 @@
#include "libavutil/mem_internal.h"
#include "libavcodec/takdsp.h"
+#include "libavcodec/mathops.h"
#include "checkasm.h"
@@ -33,8 +34,9 @@
buf[i] = rnd(); \
} while (0)
-static void test_decorrelate_ls(TAKDSPContext *s) {
#define BUF_SIZE 1024
+
+static void test_decorrelate_ls(TAKDSPContext *s) {
declare_func(void, int32_t *, int32_t *, int);
if (check_func(s->decorrelate_ls, "decorrelate_ls")) {
@@ -60,7 +62,6 @@ static void test_decorrelate_ls(TAKDSPContext *s) {
}
static void test_decorrelate_sr(TAKDSPContext *s) {
-#define BUF_SIZE 1024
declare_func(void, int32_t *, int32_t *, int);
if (check_func(s->decorrelate_sr, "decorrelate_sr")) {
@@ -86,7 +87,6 @@ static void test_decorrelate_sr(TAKDSPContext *s) {
}
static void test_decorrelate_sm(TAKDSPContext *s) {
-#define BUF_SIZE 1024
declare_func(void, int32_t *, int32_t *, int);
if (check_func(s->decorrelate_sm, "decorrelate_sm")) {
@@ -113,6 +113,35 @@ static void test_decorrelate_sm(TAKDSPContext *s) {
report("decorrelate_sm");
}
+static void test_decorrelate_sf(TAKDSPContext *s) {
+ declare_func(void, int32_t *, int32_t *, int, int, int);
+
+ if (check_func(s->decorrelate_sf, "decorrelate_sf")) {
+ LOCAL_ALIGNED_32(int32_t, p1, [BUF_SIZE]);
+ LOCAL_ALIGNED_32(int32_t, p1_2, [BUF_SIZE]);
+ LOCAL_ALIGNED_32(int32_t, p2, [BUF_SIZE]);
+ LOCAL_ALIGNED_32(int32_t, p2_2, [BUF_SIZE]);
+ int dshift, dfactor;
+
+ randomize(p1, BUF_SIZE);
+ memcpy(p1, p1_2, BUF_SIZE);
+ randomize(p2, BUF_SIZE);
+ memcpy(p2_2, p2, BUF_SIZE);
+ dshift = (rnd() & 0xF) + 1;
+ dfactor = sign_extend(rnd(), 10);
+ call_ref(p1, p2, BUF_SIZE, dshift, dfactor);
+ call_new(p1_2, p2_2, BUF_SIZE, dshift, dfactor);
+
+ if (memcmp(p2, p2_2, BUF_SIZE) != 0){
+ fail();
+ }
+
+ bench_new(p1, p2, BUF_SIZE, dshift, dfactor);
+ }
+
+ report("decorrelate_sf");
+}
+
void checkasm_check_takdsp(void)
{
TAKDSPContext s = { 0 };
@@ -121,4 +150,5 @@ void checkasm_check_takdsp(void)
test_decorrelate_ls(&s);
test_decorrelate_sr(&s);
test_decorrelate_sm(&s);
+ test_decorrelate_sf(&s);
}
--
2.43.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* [FFmpeg-devel] [PATCH 2/2] x86/takdsp: add avx2 versions of all functions
2023-12-22 1:15 [FFmpeg-devel] [PATCH 1/2] checkasm/takdsp: add decorrelate_sf test James Almer
@ 2023-12-22 1:15 ` James Almer
2023-12-22 23:08 ` Michael Niedermayer
2023-12-22 9:52 ` [FFmpeg-devel] [PATCH 1/2] checkasm/takdsp: add decorrelate_sf test Martin Storsjö
1 sibling, 1 reply; 8+ messages in thread
From: James Almer @ 2023-12-22 1:15 UTC (permalink / raw)
To: ffmpeg-devel
On an Intel Core i7 12700k:
decorrelate_ls_c: 814.3
decorrelate_ls_sse2: 165.8
decorrelate_ls_avx2: 101.3
decorrelate_sf_c: 1602.6
decorrelate_sf_sse4: 640.1
decorrelate_sf_avx2: 324.6
decorrelate_sm_c: 1564.8
decorrelate_sm_sse2: 379.3
decorrelate_sm_avx2: 203.3
decorrelate_sr_c: 785.3
decorrelate_sr_sse2: 176.3
decorrelate_sr_avx2: 99.8
Signed-off-by: James Almer <jamrial@gmail.com>
---
libavcodec/x86/takdsp.asm | 36 ++++++++++++++++++++++--------------
libavcodec/x86/takdsp_init.c | 11 +++++++++++
2 files changed, 33 insertions(+), 14 deletions(-)
diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm
index be8e1ab553..a5501cc285 100644
--- a/libavcodec/x86/takdsp.asm
+++ b/libavcodec/x86/takdsp.asm
@@ -28,7 +28,7 @@ pd_128: times 4 dd 128
SECTION .text
-INIT_XMM sse2
+%macro TAK_DECORRELATE 0
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
shl lengthd, 2
add p1q, lengthq
@@ -73,10 +73,8 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
mova m1, [p2q+lengthq]
mova m3, [p1q+lengthq+mmsize]
mova m4, [p2q+lengthq+mmsize]
- mova m2, m1
- mova m5, m4
- psrad m2, 1
- psrad m5, 1
+ psrad m2, m1, 1
+ psrad m5, m4, 1
psubd m0, m2
psubd m3, m5
paddd m1, m0
@@ -88,29 +86,39 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
add lengthq, mmsize*2
jl .loop
RET
+%endmacro
-INIT_XMM sse4
+INIT_XMM sse2
+TAK_DECORRELATE
+INIT_YMM avx2
+TAK_DECORRELATE
+
+%macro TAK_DECORRELATE_SF 0
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
shl lengthd, 2
add p1q, lengthq
add p2q, lengthq
neg lengthq
- movd m2, dshiftm
- movd m3, dfactorm
- pshufd m3, m3, 0
- mova m4, [pd_128]
+ movd xm2, dshiftm
+ VPBROADCASTD m3, dfactorm
+ VBROADCASTI128 m4, [pd_128]
.loop:
- mova m0, [p1q+lengthq]
mova m1, [p2q+lengthq]
- psrad m1, m2
+ psrad m1, xm2
pmulld m1, m3
paddd m1, m4
psrad m1, 8
- pslld m1, m2
- psubd m1, m0
+ pslld m1, xm2
+ psubd m1, [p1q+lengthq]
mova [p1q+lengthq], m1
add lengthq, mmsize
jl .loop
RET
+%endmacro
+
+INIT_XMM sse4
+TAK_DECORRELATE_SF
+INIT_YMM avx2
+TAK_DECORRELATE_SF
diff --git a/libavcodec/x86/takdsp_init.c b/libavcodec/x86/takdsp_init.c
index b2e6e639ee..c99a057b24 100644
--- a/libavcodec/x86/takdsp_init.c
+++ b/libavcodec/x86/takdsp_init.c
@@ -24,9 +24,13 @@
#include "config.h"
void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_ls_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sr_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sm_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
+void ff_tak_decorrelate_sf_avx2(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
{
@@ -42,5 +46,12 @@ av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
if (EXTERNAL_SSE4(cpu_flags)) {
c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
}
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->decorrelate_ls = ff_tak_decorrelate_ls_avx2;
+ c->decorrelate_sr = ff_tak_decorrelate_sr_avx2;
+ c->decorrelate_sm = ff_tak_decorrelate_sm_avx2;
+ c->decorrelate_sf = ff_tak_decorrelate_sf_avx2;
+ }
#endif
}
--
2.43.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/2] checkasm/takdsp: add decorrelate_sf test
2023-12-22 1:15 [FFmpeg-devel] [PATCH 1/2] checkasm/takdsp: add decorrelate_sf test James Almer
2023-12-22 1:15 ` [FFmpeg-devel] [PATCH 2/2] x86/takdsp: add avx2 versions of all functions James Almer
@ 2023-12-22 9:52 ` Martin Storsjö
1 sibling, 0 replies; 8+ messages in thread
From: Martin Storsjö @ 2023-12-22 9:52 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Thu, 21 Dec 2023, James Almer wrote:
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
> tests/checkasm/takdsp.c | 36 +++++++++++++++++++++++++++++++++---
> 1 file changed, 33 insertions(+), 3 deletions(-)
> @@ -113,6 +113,35 @@ static void test_decorrelate_sm(TAKDSPContext *s) {
> report("decorrelate_sm");
> }
>
> +static void test_decorrelate_sf(TAKDSPContext *s) {
> + declare_func(void, int32_t *, int32_t *, int, int, int);
> +
> + if (check_func(s->decorrelate_sf, "decorrelate_sf")) {
> + LOCAL_ALIGNED_32(int32_t, p1, [BUF_SIZE]);
> + LOCAL_ALIGNED_32(int32_t, p1_2, [BUF_SIZE]);
> + LOCAL_ALIGNED_32(int32_t, p2, [BUF_SIZE]);
> + LOCAL_ALIGNED_32(int32_t, p2_2, [BUF_SIZE]);
> + int dshift, dfactor;
> +
> + randomize(p1, BUF_SIZE);
> + memcpy(p1, p1_2, BUF_SIZE);
The source and destination of the memcpy are flipped here, and the size
needs a *sizeof(*p1), since BUF_SIZE is an element count rather than a byte count.
> + randomize(p2, BUF_SIZE);
> + memcpy(p2_2, p2, BUF_SIZE);
> + dshift = (rnd() & 0xF) + 1;
> + dfactor = sign_extend(rnd(), 10);
> + call_ref(p1, p2, BUF_SIZE, dshift, dfactor);
> + call_new(p1_2, p2_2, BUF_SIZE, dshift, dfactor);
This function only mutates p1, not p2, so the copy of p2 into p2_2 is not
strictly necessary.
> +
> + if (memcmp(p2, p2_2, BUF_SIZE) != 0){
As we're mutating p1, that is the buffer that should be checked (p1 against
p1_2). The memcmp also needs a *sizeof(*p1), and there should be a space
between ) and {.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] x86/takdsp: add avx2 versions of all functions
2023-12-22 1:15 ` [FFmpeg-devel] [PATCH 2/2] x86/takdsp: add avx2 versions of all functions James Almer
@ 2023-12-22 23:08 ` Michael Niedermayer
2023-12-22 23:30 ` James Almer
2023-12-22 23:52 ` [FFmpeg-devel] [PATCH 2/2 v2] " James Almer
0 siblings, 2 replies; 8+ messages in thread
From: Michael Niedermayer @ 2023-12-22 23:08 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1.1: Type: text/plain, Size: 1786 bytes --]
On Thu, Dec 21, 2023 at 10:15:49PM -0300, James Almer wrote:
> On an Intel Core i7 12700k:
>
> decorrelate_ls_c: 814.3
> decorrelate_ls_sse2: 165.8
> decorrelate_ls_avx2: 101.3
> decorrelate_sf_c: 1602.6
> decorrelate_sf_sse4: 640.1
> decorrelate_sf_avx2: 324.6
> decorrelate_sm_c: 1564.8
> decorrelate_sm_sse2: 379.3
> decorrelate_sm_avx2: 203.3
> decorrelate_sr_c: 785.3
> decorrelate_sr_sse2: 176.3
> decorrelate_sr_avx2: 99.8
>
> Signed-off-by: James Almer <jamrial@gmail.com>
on AMD Ryzen 9 3950X 16-Core Processor
Illegal instruction (core dumped)
threads=1
tests/Makefile:308: recipe for target 'fate-lossless-tak' failed
make: *** [fate-lossless-tak] Error 132
(gdb) disassemble $rip-32, $rip+32
Dump of assembler code from 0x55555651a580 to 0x55555651a5c0:
0x000055555651a580: or $0x17,%al
0x000055555651a582: movdqa %xmm1,(%rdi,%rdx,1)
0x000055555651a587: add $0x10,%rdx
0x000055555651a58b: jl 0x55555651a562
0x000055555651a58d: retq
0x000055555651a58e: nop
0x000055555651a58f: nop
0x000055555651a590: shl $0x2,%edx
0x000055555651a593: add %rdx,%rdi
0x000055555651a596: add %rdx,%rsi
0x000055555651a599: neg %rdx
0x000055555651a59c: vmovd %ecx,%xmm2
=> 0x000055555651a5a0: vpbroadcastd %r8d,%ymm3
0x000055555651a5a6: vbroadcasti128 0x4bc751(%rip),%ymm4 # 0x5555569d6d00
0x000055555651a5af: vmovdqa (%rsi,%rdx,1),%ymm1
0x000055555651a5b4: vpsrad %xmm2,%ymm1,%ymm1
0x000055555651a5b8: vpmulld %ymm3,%ymm1,%ymm1
0x000055555651a5bd: vpaddd %ymm4,%ymm1,%ymm1
End of assembler dump.
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Why not whip the teacher when the pupil misbehaves? -- Diogenes of Sinope
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] x86/takdsp: add avx2 versions of all functions
2023-12-22 23:08 ` Michael Niedermayer
@ 2023-12-22 23:30 ` James Almer
2023-12-22 23:52 ` [FFmpeg-devel] [PATCH 2/2 v2] " James Almer
1 sibling, 0 replies; 8+ messages in thread
From: James Almer @ 2023-12-22 23:30 UTC (permalink / raw)
To: ffmpeg-devel
On 12/22/2023 8:08 PM, Michael Niedermayer wrote:
> On Thu, Dec 21, 2023 at 10:15:49PM -0300, James Almer wrote:
>> On an Intel Core i7 12700k:
>>
>> decorrelate_ls_c: 814.3
>> decorrelate_ls_sse2: 165.8
>> decorrelate_ls_avx2: 101.3
>> decorrelate_sf_c: 1602.6
>> decorrelate_sf_sse4: 640.1
>> decorrelate_sf_avx2: 324.6
>> decorrelate_sm_c: 1564.8
>> decorrelate_sm_sse2: 379.3
>> decorrelate_sm_avx2: 203.3
>> decorrelate_sr_c: 785.3
>> decorrelate_sr_sse2: 176.3
>> decorrelate_sr_avx2: 99.8
>>
>> Signed-off-by: James Almer <jamrial@gmail.com>
>
> on AMD Ryzen 9 3950X 16-Core Processor
>
> Illegal instruction (core dumped)
> threads=1
> tests/Makefile:308: recipe for target 'fate-lossless-tak' failed
> make: *** [fate-lossless-tak] Error 132
>
> (gdb) disassemble $rip-32, $rip+32
> Dump of assembler code from 0x55555651a580 to 0x55555651a5c0:
> 0x000055555651a580: or $0x17,%al
> 0x000055555651a582: movdqa %xmm1,(%rdi,%rdx,1)
> 0x000055555651a587: add $0x10,%rdx
> 0x000055555651a58b: jl 0x55555651a562
> 0x000055555651a58d: retq
> 0x000055555651a58e: nop
> 0x000055555651a58f: nop
> 0x000055555651a590: shl $0x2,%edx
> 0x000055555651a593: add %rdx,%rdi
> 0x000055555651a596: add %rdx,%rsi
> 0x000055555651a599: neg %rdx
> 0x000055555651a59c: vmovd %ecx,%xmm2
> => 0x000055555651a5a0: vpbroadcastd %r8d,%ymm3
Right, on Linux the fifth argument is passed in a GPR, and vpbroadcastd with
a GPR source is AVX-512.
Will fix and resend.
> 0x000055555651a5a6: vbroadcasti128 0x4bc751(%rip),%ymm4 # 0x5555569d6d00
> 0x000055555651a5af: vmovdqa (%rsi,%rdx,1),%ymm1
> 0x000055555651a5b4: vpsrad %xmm2,%ymm1,%ymm1
> 0x000055555651a5b8: vpmulld %ymm3,%ymm1,%ymm1
> 0x000055555651a5bd: vpaddd %ymm4,%ymm1,%ymm1
> End of assembler dump.
>
>
> [...]
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* [FFmpeg-devel] [PATCH 2/2 v2] x86/takdsp: add avx2 versions of all functions
2023-12-22 23:08 ` Michael Niedermayer
2023-12-22 23:30 ` James Almer
@ 2023-12-22 23:52 ` James Almer
2023-12-23 10:44 ` Lynne
1 sibling, 1 reply; 8+ messages in thread
From: James Almer @ 2023-12-22 23:52 UTC (permalink / raw)
To: ffmpeg-devel
On an Intel Core i7 12700k:
decorrelate_ls_c: 814.3
decorrelate_ls_sse2: 165.8
decorrelate_ls_avx2: 101.3
decorrelate_sf_c: 1602.6
decorrelate_sf_sse4: 640.1
decorrelate_sf_avx2: 324.6
decorrelate_sm_c: 1564.8
decorrelate_sm_sse2: 379.3
decorrelate_sm_avx2: 203.3
decorrelate_sr_c: 785.3
decorrelate_sr_sse2: 176.3
decorrelate_sr_avx2: 99.8
Signed-off-by: James Almer <jamrial@gmail.com>
---
libavcodec/x86/takdsp.asm | 41 ++++++++++++++++++++++++------------
libavcodec/x86/takdsp_init.c | 11 ++++++++++
2 files changed, 38 insertions(+), 14 deletions(-)
diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm
index be8e1ab553..d55c5f39aa 100644
--- a/libavcodec/x86/takdsp.asm
+++ b/libavcodec/x86/takdsp.asm
@@ -28,7 +28,7 @@ pd_128: times 4 dd 128
SECTION .text
-INIT_XMM sse2
+%macro TAK_DECORRELATE 0
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
shl lengthd, 2
add p1q, lengthq
@@ -73,10 +73,8 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
mova m1, [p2q+lengthq]
mova m3, [p1q+lengthq+mmsize]
mova m4, [p2q+lengthq+mmsize]
- mova m2, m1
- mova m5, m4
- psrad m2, 1
- psrad m5, 1
+ psrad m2, m1, 1
+ psrad m5, m4, 1
psubd m0, m2
psubd m3, m5
paddd m1, m0
@@ -88,29 +86,44 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
add lengthq, mmsize*2
jl .loop
RET
+%endmacro
-INIT_XMM sse4
+INIT_XMM sse2
+TAK_DECORRELATE
+INIT_YMM avx2
+TAK_DECORRELATE
+
+%macro TAK_DECORRELATE_SF 0
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
shl lengthd, 2
add p1q, lengthq
add p2q, lengthq
neg lengthq
- movd m2, dshiftm
- movd m3, dfactorm
- pshufd m3, m3, 0
- mova m4, [pd_128]
+ movd xm2, dshiftm
+%if UNIX64
+ movd xm3, dfactorm
+ VPBROADCASTD m3, xm3
+%else
+ VPBROADCASTD m3, dfactorm
+%endif
+ VBROADCASTI128 m4, [pd_128]
.loop:
- mova m0, [p1q+lengthq]
mova m1, [p2q+lengthq]
- psrad m1, m2
+ psrad m1, xm2
pmulld m1, m3
paddd m1, m4
psrad m1, 8
- pslld m1, m2
- psubd m1, m0
+ pslld m1, xm2
+ psubd m1, [p1q+lengthq]
mova [p1q+lengthq], m1
add lengthq, mmsize
jl .loop
RET
+%endmacro
+
+INIT_XMM sse4
+TAK_DECORRELATE_SF
+INIT_YMM avx2
+TAK_DECORRELATE_SF
diff --git a/libavcodec/x86/takdsp_init.c b/libavcodec/x86/takdsp_init.c
index 12b62b8247..9553f8442c 100644
--- a/libavcodec/x86/takdsp_init.c
+++ b/libavcodec/x86/takdsp_init.c
@@ -24,9 +24,13 @@
#include "config.h"
void ff_tak_decorrelate_ls_sse2(const int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_ls_avx2(const int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sr_sse2(int32_t *p1, const int32_t *p2, int length);
+void ff_tak_decorrelate_sr_avx2(int32_t *p1, const int32_t *p2, int length);
void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sm_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sf_sse4(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor);
+void ff_tak_decorrelate_sf_avx2(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor);
av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
{
@@ -42,5 +46,12 @@ av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
if (EXTERNAL_SSE4(cpu_flags)) {
c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
}
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->decorrelate_ls = ff_tak_decorrelate_ls_avx2;
+ c->decorrelate_sr = ff_tak_decorrelate_sr_avx2;
+ c->decorrelate_sm = ff_tak_decorrelate_sm_avx2;
+ c->decorrelate_sf = ff_tak_decorrelate_sf_avx2;
+ }
#endif
}
--
2.43.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2 v2] x86/takdsp: add avx2 versions of all functions
2023-12-22 23:52 ` [FFmpeg-devel] [PATCH 2/2 v2] " James Almer
@ 2023-12-23 10:44 ` Lynne
2023-12-23 11:46 ` Andreas Rheinhardt
0 siblings, 1 reply; 8+ messages in thread
From: Lynne @ 2023-12-23 10:44 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Dec 23, 2023, 00:53 by jamrial@gmail.com:
> On an Intel Core i7 12700k:
>
> decorrelate_ls_c: 814.3
> decorrelate_ls_sse2: 165.8
> decorrelate_ls_avx2: 101.3
> decorrelate_sf_c: 1602.6
> decorrelate_sf_sse4: 640.1
> decorrelate_sf_avx2: 324.6
> decorrelate_sm_c: 1564.8
> decorrelate_sm_sse2: 379.3
> decorrelate_sm_avx2: 203.3
> decorrelate_sr_c: 785.3
> decorrelate_sr_sse2: 176.3
> decorrelate_sr_avx2: 99.8
>
> Signed-off-by: James Almer <jamrial@gmail.com>
>
Even better on a Zen3:
checkasm: all 8 tests passed
decorrelate_ls_c: 111.1
decorrelate_ls_sse2: 272.6
decorrelate_ls_avx2: 94.1
decorrelate_sf_c: 170.6
decorrelate_sf_sse4: 400.1
decorrelate_sf_avx2: 196.1
decorrelate_sm_c: 187.6
decorrelate_sm_sse2: 383.1
decorrelate_sm_avx2: 179.1
decorrelate_sr_c: 102.6
decorrelate_sr_sse2: 272.6
decorrelate_sr_avx2: 94.1
Tested, decoding works fine too, LGTM
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2 v2] x86/takdsp: add avx2 versions of all functions
2023-12-23 10:44 ` Lynne
@ 2023-12-23 11:46 ` Andreas Rheinhardt
0 siblings, 0 replies; 8+ messages in thread
From: Andreas Rheinhardt @ 2023-12-23 11:46 UTC (permalink / raw)
To: ffmpeg-devel
Lynne:
> Dec 23, 2023, 00:53 by jamrial@gmail.com:
>
>> On an Intel Core i7 12700k:
>>
>> decorrelate_ls_c: 814.3
>> decorrelate_ls_sse2: 165.8
>> decorrelate_ls_avx2: 101.3
>> decorrelate_sf_c: 1602.6
>> decorrelate_sf_sse4: 640.1
>> decorrelate_sf_avx2: 324.6
>> decorrelate_sm_c: 1564.8
>> decorrelate_sm_sse2: 379.3
>> decorrelate_sm_avx2: 203.3
>> decorrelate_sr_c: 785.3
>> decorrelate_sr_sse2: 176.3
>> decorrelate_sr_avx2: 99.8
>>
>> Signed-off-by: James Almer <jamrial@gmail.com>
>>
>
> Even better on a Zen3:
> checkasm: all 8 tests passed
> decorrelate_ls_c: 111.1
> decorrelate_ls_sse2: 272.6
> decorrelate_ls_avx2: 94.1
> decorrelate_sf_c: 170.6
> decorrelate_sf_sse4: 400.1
> decorrelate_sf_avx2: 196.1
> decorrelate_sm_c: 187.6
> decorrelate_sm_sse2: 383.1
> decorrelate_sm_avx2: 179.1
> decorrelate_sr_c: 102.6
> decorrelate_sr_sse2: 272.6
> decorrelate_sr_avx2: 94.1
>
The SSE2 version is worse than the C version? Does this happen for more
DSP code?
(For decorrelate_sf, the C version is still the fastest, and the gain of
AVX2 over C is small for the other three as well.)
- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2023-12-23 11:45 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-22 1:15 [FFmpeg-devel] [PATCH 1/2] checkasm/takdsp: add decorrelate_sf test James Almer
2023-12-22 1:15 ` [FFmpeg-devel] [PATCH 2/2] x86/takdsp: add avx2 versions of all functions James Almer
2023-12-22 23:08 ` Michael Niedermayer
2023-12-22 23:30 ` James Almer
2023-12-22 23:52 ` [FFmpeg-devel] [PATCH 2/2 v2] " James Almer
2023-12-23 10:44 ` Lynne
2023-12-23 11:46 ` Andreas Rheinhardt
2023-12-22 9:52 ` [FFmpeg-devel] [PATCH 1/2] checkasm/takdsp: add decorrelate_sf test Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git