* [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm
@ 2024-05-25 20:57 Lynne via ffmpeg-devel
2024-05-25 22:12 ` Michael Niedermayer
` (3 more replies)
0 siblings, 4 replies; 14+ messages in thread
From: Lynne via ffmpeg-devel @ 2024-05-25 20:57 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lynne
The inline asm function had issues running under checkasm.
So I came to finish what I started, and wrote the last part
of LPC computation in assembly.
autocorr_10_c: 135525.8
autocorr_10_sse2: 50729.8
autocorr_10_fma3: 19007.8
autocorr_30_c: 390100.8
autocorr_30_sse2: 142478.8
autocorr_30_fma3: 50559.8
autocorr_32_c: 407058.3
autocorr_32_sse2: 151633.3
autocorr_32_fma3: 50517.3
---
libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
2 files changed, 100 insertions(+), 78 deletions(-)
diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
index a585c17ef5..790841b7f4 100644
--- a/libavcodec/x86/lpc.asm
+++ b/libavcodec/x86/lpc.asm
@@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0
dec_tab_scalar: times 2 dq -1.0
seq_tab_sse2: dq 1.0, 0.0
+autoc_init_tab: times 4 dq 1.0
+
SECTION .text
%macro APPLY_WELCH_FN 0
@@ -261,3 +263,92 @@ APPLY_WELCH_FN
INIT_YMM avx2
APPLY_WELCH_FN
%endif
+
+%macro COMPUTE_AUTOCORR_FN 0
+cglobal lpc_compute_autocorr, 4, 7, 8, data, len, lag, autoc, lag_p, data_l, len_p
+
+ shl lagd, 3
+ shl lenq, 3
+ xor lag_pq, lag_pq
+
+.lag_l:
+ movaps m8, [autoc_init_tab]
+
+ mov len_pq, lag_pq
+
+ lea data_lq, [lag_pq + mmsize - 8]
+ neg data_lq ; -j - mmsize
+ add data_lq, dataq ; data[-j - mmsize]
+.len_l:
+ ; We waste the upper value here on SSE2,
+ ; but we use it on AVX.
+ movupd xm0, [dataq + len_pq] ; data[i]
+ movupd m1, [data_lq + len_pq] ; data[i - j]
+
+%if cpuflag(avx)
+ vbroadcastsd m0, xm0
+ vperm2f128 m1, m1, m1, 0x01
+%endif
+
+ shufpd m0, m0, m0, 1100b
+ shufpd m1, m1, m1, 0101b
+
+%if cpuflag(fma3)
+ fmaddpd m8, m0, m1, m8 ; sum += data[i]*data[i-j]
+%else
+ mulpd m0, m1
+ addpd m8, m0 ; sum += data[i]*data[i-j]
+%endif
+
+ add len_pq, 8
+ cmp len_pq, lenq
+ jl .len_l
+
+ movups [autocq + lag_pq], m8 ; autoc[j] = sum
+ add lag_pq, mmsize
+ cmp lag_pq, lagq
+ jl .lag_l
+
+ ; The tail computation is guaranteed never to happen
+ ; as long as we're doing multiples of 4, rather than 2.
+ ; It is trivial to convert this to avx if ever needed.
+%if !cpuflag(avx)
+ jg .end
+ ; If lag_p == lag fallthrough
+
+.tail:
+ movaps xm2, [autoc_init_tab]
+
+ mov len_pq, lag_pq
+ sub len_pq, mmsize
+
+ lea data_lq, [lag_pq]
+ neg data_lq ; -j
+ add data_lq, dataq ; data[-j]
+
+.tail_l:
+ movupd xm0, [dataq + len_pq]
+ movupd xm1, [data_lq + len_pq]
+
+ mulpd xm0, xm1
+ addpd xm2, xm0 ; sum += data[i]*data[i-j]
+
+ add len_pq, mmsize
+ cmp len_pq, lenq
+ jl .tail_l
+
+ shufpd xm1, xm2, xm2, 01b
+ addpd xm2, xm1
+
+ movhpd [autocq + lag_pq], xm2
+%endif
+
+.end:
+ RET
+
+%endmacro
+
+INIT_XMM sse2
+COMPUTE_AUTOCORR_FN
+INIT_YMM fma3
+COMPUTE_AUTOCORR_FN
diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
index f2fca53799..96469fae40 100644
--- a/libavcodec/x86/lpc_init.c
+++ b/libavcodec/x86/lpc_init.c
@@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len,
double *w_data);
void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
double *w_data);
-
-DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
-
-#if HAVE_SSE2_INLINE
-
-static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
- double *autoc)
-{
- int j;
-
- if((x86_reg)data & 15)
- data++;
-
- for(j=0; j<lag; j+=2){
- x86_reg i = -len*sizeof(double);
- if(j == lag-2) {
- __asm__ volatile(
- "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm2 \n\t"
- "1: \n\t"
- "movapd (%2,%0), %%xmm3 \n\t"
- "movupd -8(%3,%0), %%xmm4 \n\t"
- "movapd (%3,%0), %%xmm5 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm5 \n\t"
- "mulpd -16(%3,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm5, %%xmm0 \n\t"
- "addpd %%xmm3, %%xmm2 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "movhlps %%xmm2, %%xmm5 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "addsd %%xmm5, %%xmm2 \n\t"
- "movsd %%xmm0, (%1) \n\t"
- "movsd %%xmm1, 8(%1) \n\t"
- "movsd %%xmm2, 16(%1) \n\t"
- :"+&r"(i)
- :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
- NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
- :"memory"
- );
- } else {
- __asm__ volatile(
- "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
- "1: \n\t"
- "movapd (%3,%0), %%xmm3 \n\t"
- "movupd -8(%4,%0), %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd (%4,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm3, %%xmm0 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "movsd %%xmm0, %1 \n\t"
- "movsd %%xmm1, %2 \n\t"
- :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
- :"r"(data+len), "r"(data+len-j)
- NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
- );
- }
- }
-}
-
-#endif /* HAVE_SSE2_INLINE */
+void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
+ double *autoc);
+void ff_lpc_compute_autocorr_fma3(const double *data, ptrdiff_t len, int lag,
+ double *autoc);
av_cold void ff_lpc_init_x86(LPCContext *c)
{
int cpu_flags = av_get_cpu_flags();
-#if HAVE_SSE2_INLINE
- if (INLINE_SSE2_SLOW(cpu_flags))
- c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
-#endif
+ if (EXTERNAL_SSE2(cpu_flags))
+ c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
+
+ if (EXTERNAL_FMA3(cpu_flags))
+ c->lpc_compute_autocorr = ff_lpc_compute_autocorr_fma3;
if (EXTERNAL_SSE2(cpu_flags))
c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
--
2.43.0.381.gb435a96ce8
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-25 20:57 [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm Lynne via ffmpeg-devel
@ 2024-05-25 22:12 ` Michael Niedermayer
2024-05-25 22:31 ` James Almer
` (2 subsequent siblings)
3 siblings, 0 replies; 14+ messages in thread
From: Michael Niedermayer @ 2024-05-25 22:12 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1.1: Type: text/plain, Size: 2739 bytes --]
On Sat, May 25, 2024 at 10:57:21PM +0200, Lynne via ffmpeg-devel wrote:
> The inline asm function had issues running under checkasm.
> So I came to finish what I started, and wrote the last part
> of LPC computation in assembly.
>
> autocorr_10_c: 135525.8
> autocorr_10_sse2: 50729.8
> autocorr_10_fma3: 19007.8
> autocorr_30_c: 390100.8
> autocorr_30_sse2: 142478.8
> autocorr_30_fma3: 50559.8
> autocorr_32_c: 407058.3
> autocorr_32_sse2: 151633.3
> autocorr_32_fma3: 50517.3
> ---
> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
> 2 files changed, 100 insertions(+), 78 deletions(-)
breaks build on x86-32
src/libavcodec/x86/lpc.asm:352: error: symbol `m8' undefined
src/libavcodec/x86/lpc.asm:275: ... from macro `COMPUTE_AUTOCORR_FN' defined here
src//libavutil/x86/x86inc.asm:1636: ... from macro `movaps' defined here
src//libavutil/x86/x86inc.asm:1501: ... from macro `RUN_AVX_INSTR' defined here
src/libavcodec/x86/lpc.asm:352: error: symbol `m8' undefined
src/libavcodec/x86/lpc.asm:300: ... from macro `COMPUTE_AUTOCORR_FN' defined here
src//libavutil/x86/x86inc.asm:1535: ... from macro `addpd' defined here
src//libavutil/x86/x86inc.asm:1501: ... from macro `RUN_AVX_INSTR' defined here
src/libavcodec/x86/lpc.asm:352: error: symbol `m8' undefined
src/libavcodec/x86/lpc.asm:307: ... from macro `COMPUTE_AUTOCORR_FN' defined here
src//libavutil/x86/x86inc.asm:1659: ... from macro `movups' defined here
src//libavutil/x86/x86inc.asm:1501: ... from macro `RUN_AVX_INSTR' defined here
src/libavcodec/x86/lpc.asm:354: error: symbol `m8' undefined
src/libavcodec/x86/lpc.asm:275: ... from macro `COMPUTE_AUTOCORR_FN' defined here
src//libavutil/x86/x86inc.asm:1636: ... from macro `movaps' defined here
src//libavutil/x86/x86inc.asm:1501: ... from macro `RUN_AVX_INSTR' defined here
src/libavcodec/x86/lpc.asm:354: error: symbol `m8' undefined
src/libavcodec/x86/lpc.asm:297: ... from macro `COMPUTE_AUTOCORR_FN' defined here
src//libavutil/x86/x86inc.asm:1937: ... from macro `fmaddpd' defined here
src/libavcodec/x86/lpc.asm:354: error: symbol `m8' undefined
src/libavcodec/x86/lpc.asm:307: ... from macro `COMPUTE_AUTOCORR_FN' defined here
src//libavutil/x86/x86inc.asm:1659: ... from macro `movups' defined here
src//libavutil/x86/x86inc.asm:1501: ... from macro `RUN_AVX_INSTR' defined here
make: *** [src/ffbuild/common.mak:103: libavcodec/x86/lpc.o] Error 1
make: *** Waiting for unfinished jobs....
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
No great genius has ever existed without some touch of madness. -- Aristotle
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-25 20:57 [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm Lynne via ffmpeg-devel
2024-05-25 22:12 ` Michael Niedermayer
@ 2024-05-25 22:31 ` James Almer
2024-05-25 22:45 ` James Almer
` (2 more replies)
2024-05-26 0:39 ` James Almer
2024-05-26 1:42 ` [FFmpeg-devel] [PATCH v2] " Lynne via ffmpeg-devel
3 siblings, 3 replies; 14+ messages in thread
From: James Almer @ 2024-05-25 22:31 UTC (permalink / raw)
To: ffmpeg-devel
On 5/25/2024 5:57 PM, Lynne via ffmpeg-devel wrote:
> The inline asm function had issues running under checkasm.
> So I came to finish what I started, and wrote the last part
> of LPC computation in assembly.
>
> autocorr_10_c: 135525.8
> autocorr_10_sse2: 50729.8
> autocorr_10_fma3: 19007.8
> autocorr_30_c: 390100.8
> autocorr_30_sse2: 142478.8
> autocorr_30_fma3: 50559.8
> autocorr_32_c: 407058.3
> autocorr_32_sse2: 151633.3
> autocorr_32_fma3: 50517.3
> ---
> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
> 2 files changed, 100 insertions(+), 78 deletions(-)
>
> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
> index a585c17ef5..790841b7f4 100644
> --- a/libavcodec/x86/lpc.asm
> +++ b/libavcodec/x86/lpc.asm
> @@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0
> dec_tab_scalar: times 2 dq -1.0
> seq_tab_sse2: dq 1.0, 0.0
>
> +autoc_init_tab: times 4 dq 1.0
> +
> SECTION .text
>
> %macro APPLY_WELCH_FN 0
> @@ -261,3 +263,92 @@ APPLY_WELCH_FN
> INIT_YMM avx2
> APPLY_WELCH_FN
> %endif
> +
> +%macro COMPUTE_AUTOCORR_FN 0
> +cglobal lpc_compute_autocorr, 4, 7, 8, data, len, lag, autoc, lag_p, data_l, len_p
Already mentioned, but it should be 3 not 8.
> +
> + shl lagd, 3
> + shl lenq, 3
> + xor lag_pq, lag_pq
> +
> +.lag_l:
> + movaps m8, [autoc_init_tab]
m2
> +
> + mov len_pq, lag_pq
> +
> + lea data_lq, [lag_pq + mmsize - 8]
> + neg data_lq ; -j - mmsize
> + add data_lq, dataq ; data[-j - mmsize]
> +.len_l:
> + ; We waste the upper value here on SSE2,
> + ; but we use it on AVX.
> + movupd xm0, [dataq + len_pq] ; data[i]
movsd
> + movupd m1, [data_lq + len_pq] ; data[i - j]
> +
> +%if cpuflag(avx)
%if mmsize == 32 here and everywhere else.
> + vbroadcastsd m0, xm0
This is AVX2. AVX only has memory input argument. So use that and save
the movsd from above for the FMA3 version.
> + vperm2f128 m1, m1, m1, 0x01
Aren't you loading 16 extra bytes for no reason if you're just going to
use the upper 16 bytes from the load above?
> +%endif
> +
> + shufpd m0, m0, m0, 1100b
The last argument has two bits, not four. What you're doing here is a
splat/broadcast, so you don't need it for FMA3.
> + shufpd m1, m1, m1, 0101b
The upper two bits of imm8 are ignored.
> +
> +%if cpuflag(fma3)
> + fmaddpd m8, m0, m1, m8 ; sum += data[i]*data[i-j]
> +%else
> + mulpd m0, m1
> + addpd m8, m0 ; sum += data[i]*data[i-j]
> +%endif
> +
> + add len_pq, 8
> + cmp len_pq, lenq
> + jl .len_l
> +
> + movups [autocq + lag_pq], m8 ; autoc[j] = sum
> + add lag_pq, mmsize
> + cmp lag_pq, lagq
> + jl .lag_l
> +
> + ; The tail computation is guaranteed never to happen
> + ; as long as we're doing multiples of 4, rather than 2.
> + ; It is trivial to convert this to avx if ever needed.
> +%if !cpuflag(avx)
This doesn't seem to be tested as is. Maybe the checkasm should try
other lag values?
> + jg .end
> + ; If lag_p == lag fallthrough
> +
> +.tail:
> + movaps xm2, [autoc_init_tab]
> +
> + mov len_pq, lag_pq
> + sub len_pq, mmsize
> +
> + lea data_lq, [lag_pq]
> + neg data_lq ; -j
> + add data_lq, dataq ; data[-j]
> +
> +.tail_l:
> + movupd xm0, [dataq + len_pq]
> + movupd xm1, [data_lq + len_pq]
> +
> + mulpd xm0, xm1
> + addpd xm2, xm0 ; sum += data[i]*data[i-j]
> +
> + add len_pq, mmsize
> + cmp len_pq, lenq
> + jl .tail_l
> +
> + shufpd xm1, xm2, xm2, 01b
> + addpd xm2, xm1
> +
> + movhpd [autocq + lag_pq], xm2
> +%endif
> +
> +.end:
> + RET
> +
> +%endmacro
> +
> +INIT_XMM sse2
> +COMPUTE_AUTOCORR_FN
> +INIT_YMM fma3
> +COMPUTE_AUTOCORR_FN
> diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
> index f2fca53799..96469fae40 100644
> --- a/libavcodec/x86/lpc_init.c
> +++ b/libavcodec/x86/lpc_init.c
> @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len,
> double *w_data);
> void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
> double *w_data);
> -
> -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
> -
> -#if HAVE_SSE2_INLINE
> -
> -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
> - double *autoc)
> -{
> - int j;
> -
> - if((x86_reg)data & 15)
> - data++;
> -
> - for(j=0; j<lag; j+=2){
> - x86_reg i = -len*sizeof(double);
> - if(j == lag-2) {
> - __asm__ volatile(
> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm2 \n\t"
> - "1: \n\t"
> - "movapd (%2,%0), %%xmm3 \n\t"
> - "movupd -8(%3,%0), %%xmm4 \n\t"
> - "movapd (%3,%0), %%xmm5 \n\t"
> - "mulpd %%xmm3, %%xmm4 \n\t"
> - "mulpd %%xmm3, %%xmm5 \n\t"
> - "mulpd -16(%3,%0), %%xmm3 \n\t"
> - "addpd %%xmm4, %%xmm1 \n\t"
> - "addpd %%xmm5, %%xmm0 \n\t"
> - "addpd %%xmm3, %%xmm2 \n\t"
> - "add $16, %0 \n\t"
> - "jl 1b \n\t"
> - "movhlps %%xmm0, %%xmm3 \n\t"
> - "movhlps %%xmm1, %%xmm4 \n\t"
> - "movhlps %%xmm2, %%xmm5 \n\t"
> - "addsd %%xmm3, %%xmm0 \n\t"
> - "addsd %%xmm4, %%xmm1 \n\t"
> - "addsd %%xmm5, %%xmm2 \n\t"
> - "movsd %%xmm0, (%1) \n\t"
> - "movsd %%xmm1, 8(%1) \n\t"
> - "movsd %%xmm2, 16(%1) \n\t"
> - :"+&r"(i)
> - :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
> - :"memory"
> - );
> - } else {
> - __asm__ volatile(
> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
> - "1: \n\t"
> - "movapd (%3,%0), %%xmm3 \n\t"
> - "movupd -8(%4,%0), %%xmm4 \n\t"
> - "mulpd %%xmm3, %%xmm4 \n\t"
> - "mulpd (%4,%0), %%xmm3 \n\t"
> - "addpd %%xmm4, %%xmm1 \n\t"
> - "addpd %%xmm3, %%xmm0 \n\t"
> - "add $16, %0 \n\t"
> - "jl 1b \n\t"
> - "movhlps %%xmm0, %%xmm3 \n\t"
> - "movhlps %%xmm1, %%xmm4 \n\t"
> - "addsd %%xmm3, %%xmm0 \n\t"
> - "addsd %%xmm4, %%xmm1 \n\t"
> - "movsd %%xmm0, %1 \n\t"
> - "movsd %%xmm1, %2 \n\t"
> - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
> - :"r"(data+len), "r"(data+len-j)
> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
> - );
> - }
> - }
> -}
> -
> -#endif /* HAVE_SSE2_INLINE */
> +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
> + double *autoc);
> +void ff_lpc_compute_autocorr_fma3(const double *data, ptrdiff_t len, int lag,
> + double *autoc);
>
> av_cold void ff_lpc_init_x86(LPCContext *c)
> {
> int cpu_flags = av_get_cpu_flags();
>
> -#if HAVE_SSE2_INLINE
> - if (INLINE_SSE2_SLOW(cpu_flags))
> - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
> -#endif
> + if (EXTERNAL_SSE2(cpu_flags))
> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
> +
> + if (EXTERNAL_FMA3(cpu_flags))
> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_fma3;
>
> if (EXTERNAL_SSE2(cpu_flags))
> c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-25 22:31 ` James Almer
@ 2024-05-25 22:45 ` James Almer
2024-05-26 0:02 ` Lynne via ffmpeg-devel
2024-05-25 23:24 ` Lynne via ffmpeg-devel
2024-05-26 5:45 ` Rémi Denis-Courmont
2 siblings, 1 reply; 14+ messages in thread
From: James Almer @ 2024-05-25 22:45 UTC (permalink / raw)
To: ffmpeg-devel
On 5/25/2024 7:31 PM, James Almer wrote:
> On 5/25/2024 5:57 PM, Lynne via ffmpeg-devel wrote:
>> The inline asm function had issues running under checkasm.
>> So I came to finish what I started, and wrote the last part
>> of LPC computation in assembly.
>>
>> autocorr_10_c: 135525.8
>> autocorr_10_sse2: 50729.8
>> autocorr_10_fma3: 19007.8
>> autocorr_30_c: 390100.8
>> autocorr_30_sse2: 142478.8
>> autocorr_30_fma3: 50559.8
>> autocorr_32_c: 407058.3
>> autocorr_32_sse2: 151633.3
>> autocorr_32_fma3: 50517.3
>> ---
>> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
>> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
>> 2 files changed, 100 insertions(+), 78 deletions(-)
>>
>> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
>> index a585c17ef5..790841b7f4 100644
>> --- a/libavcodec/x86/lpc.asm
>> +++ b/libavcodec/x86/lpc.asm
>> @@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0
>> dec_tab_scalar: times 2 dq -1.0
>> seq_tab_sse2: dq 1.0, 0.0
>> +autoc_init_tab: times 4 dq 1.0
>> +
>> SECTION .text
>> %macro APPLY_WELCH_FN 0
>> @@ -261,3 +263,92 @@ APPLY_WELCH_FN
>> INIT_YMM avx2
>> APPLY_WELCH_FN
>> %endif
>> +
>> +%macro COMPUTE_AUTOCORR_FN 0
>> +cglobal lpc_compute_autocorr, 4, 7, 8, data, len, lag, autoc, lag_p,
>> data_l, len_p
>
> Already mentioned, but it should be 3 not 8.
>
>> +
>> + shl lagd, 3
>> + shl lenq, 3
>> + xor lag_pq, lag_pq
>> +
>> +.lag_l:
>> + movaps m8, [autoc_init_tab]
>
> m2
>
>> +
>> + mov len_pq, lag_pq
>> +
>> + lea data_lq, [lag_pq + mmsize - 8]
>> + neg data_lq ; -j - mmsize
>> + add data_lq, dataq ; data[-j - mmsize]
>> +.len_l:
>> + ; We waste the upper value here on SSE2,
>> + ; but we use it on AVX.
>> + movupd xm0, [dataq + len_pq] ; data[i]
>
> movsd
>
>> + movupd m1, [data_lq + len_pq] ; data[i - j]
>> +
>> +%if cpuflag(avx)
>
> %if mmsize == 32 here and everywhere else.
>
>> + vbroadcastsd m0, xm0
>
> This is AVX2. AVX only has memory input argument. So use that and save
> the movsd from above for the FMA3 version.
>
>> + vperm2f128 m1, m1, m1, 0x01
>
> Aren't you loading 16 extra bytes for no reason if you're just going to
> use the upper 16 bytes from the load above?
Nevermind, this is swapping lanes.
That aside, these versions are barely better and sometimes worse in all
my tests on win64 with GCC with certain seeds.
For example, seed 4022958484 gives me:
autocorr_10_c: 21345.6
autocorr_10_sse2: 16434.6
autocorr_10_fma3: 24154.6
autocorr_30_c: 59239.1
autocorr_30_sse2: 46114.6
autocorr_30_fma3: 64147.1
autocorr_32_c: 63022.1
autocorr_32_sse2: 50040.1
autocorr_32_fma3: 66594.1
But seed 2236774811 gives me:
autocorr_10_c: 37135.3
autocorr_10_sse2: 26492.3
autocorr_10_fma3: 32943.3
autocorr_30_c: 102266.8
autocorr_30_sse2: 72933.3
autocorr_30_fma3: 85808.3
autocorr_32_c: 106537.8
autocorr_32_sse2: 77623.3
autocorr_32_fma3: 85844.3
But if i force len to always be 4999 instead of its value varying
depending on seed, i consistently get things like:
autocorr_10_c: 40447.3
autocorr_10_sse2: 39526.8
autocorr_10_fma3: 42955.3
autocorr_30_c: 111362.3
autocorr_30_sse2: 111408.3
autocorr_30_fma3: 116781.8
autocorr_32_c: 122388.3
autocorr_32_sse2: 119125.3
autocorr_32_fma3: 114239.3
It would help if someone else could confirm this, but overall i don't
see any worthwhile gain here. The old inline version, for those seeds
where it worked, was somewhat faster.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-25 22:31 ` James Almer
2024-05-25 22:45 ` James Almer
@ 2024-05-25 23:24 ` Lynne via ffmpeg-devel
2024-05-25 23:41 ` James Almer
2024-05-26 5:45 ` Rémi Denis-Courmont
2 siblings, 1 reply; 14+ messages in thread
From: Lynne via ffmpeg-devel @ 2024-05-25 23:24 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lynne
[-- Attachment #1.1.1.1: Type: text/plain, Size: 3832 bytes --]
On 26/05/2024 00:31, James Almer wrote:
> On 5/25/2024 5:57 PM, Lynne via ffmpeg-devel wrote:
>> The inline asm function had issues running under checkasm.
>> So I came to finish what I started, and wrote the last part
>> of LPC computation in assembly.
>>
>> autocorr_10_c: 135525.8
>> autocorr_10_sse2: 50729.8
>> autocorr_10_fma3: 19007.8
>> autocorr_30_c: 390100.8
>> autocorr_30_sse2: 142478.8
>> autocorr_30_fma3: 50559.8
>> autocorr_32_c: 407058.3
>> autocorr_32_sse2: 151633.3
>> autocorr_32_fma3: 50517.3
>> ---
>> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
>> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
>> 2 files changed, 100 insertions(+), 78 deletions(-)
>>
>> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
>> index a585c17ef5..790841b7f4 100644
>> --- a/libavcodec/x86/lpc.asm
>> +++ b/libavcodec/x86/lpc.asm
>> @@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0
>> dec_tab_scalar: times 2 dq -1.0
>> seq_tab_sse2: dq 1.0, 0.0
>> +autoc_init_tab: times 4 dq 1.0
>> +
>> SECTION .text
>> %macro APPLY_WELCH_FN 0
>> @@ -261,3 +263,92 @@ APPLY_WELCH_FN
>> INIT_YMM avx2
>> APPLY_WELCH_FN
>> %endif
>> +
>> +%macro COMPUTE_AUTOCORR_FN 0
>> +cglobal lpc_compute_autocorr, 4, 7, 8, data, len, lag, autoc, lag_p,
>> data_l, len_p
>
> Already mentioned, but it should be 3 not 8.
Already done, as said on IRC not 10 minutes after I submitted it.
>
>> +
>> + shl lagd, 3
>> + shl lenq, 3
>> + xor lag_pq, lag_pq
>> +
>> +.lag_l:
>> + movaps m8, [autoc_init_tab]
>
> m2
>
>> +
>> + mov len_pq, lag_pq
>> +
>> + lea data_lq, [lag_pq + mmsize - 8]
>> + neg data_lq ; -j - mmsize
>> + add data_lq, dataq ; data[-j - mmsize]
>> +.len_l:
>> + ; We waste the upper value here on SSE2,
>> + ; but we use it on AVX.
>> + movupd xm0, [dataq + len_pq] ; data[i]
>
> movsd
Fixed.
>
>> + movupd m1, [data_lq + len_pq] ; data[i - j]
>> +
>> +%if cpuflag(avx)
>
> %if mmsize == 32 here and everywhere else.
Done.
>
>> + vbroadcastsd m0, xm0
>
> This is AVX2. AVX only has memory input argument. So use that and save
> the movsd from above for the FMA3 version.
>
>> + vperm2f128 m1, m1, m1, 0x01
>
> Aren't you loading 16 extra bytes for no reason if you're just going to
> use the upper 16 bytes from the load above?
Lane swapped, like you mentioned.
>> +%endif
>> +
>> + shufpd m0, m0, m0, 1100b
>
> The last argument has two bits, not four. What you're doing here is a
> splat/broadcast, so you don't need it for FMA3.
>
>> + shufpd m1, m1, m1, 0101b
>
> The upper two bits of imm8 are ignored.
Intentional. Not ignored on FMA3.
>> +
>> +%if cpuflag(fma3)
>> + fmaddpd m8, m0, m1, m8 ; sum += data[i]*data[i-j]
>> +%else
>> + mulpd m0, m1
>> + addpd m8, m0 ; sum += data[i]*data[i-j]
>> +%endif
>> +
>> + add len_pq, 8
>> + cmp len_pq, lenq
>> + jl .len_l
>> +
>> + movups [autocq + lag_pq], m8 ; autoc[j] = sum
>> + add lag_pq, mmsize
>> + cmp lag_pq, lagq
>> + jl .lag_l
>> +
>> + ; The tail computation is guaranteed never to happen
>> + ; as long as we're doing multiples of 4, rather than 2.
>> + ; It is trivial to convert this to avx if ever needed.
>> +%if !cpuflag(avx)
>
> This doesn't seem to be tested as is. Maybe the checkasm should try
> other lag values?
That's for the checkasm patch. You can trigger this check with
fate-alac-16-lpc-orders as-is.
[-- Attachment #1.1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 637 bytes --]
[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 236 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-25 23:24 ` Lynne via ffmpeg-devel
@ 2024-05-25 23:41 ` James Almer
0 siblings, 0 replies; 14+ messages in thread
From: James Almer @ 2024-05-25 23:41 UTC (permalink / raw)
To: ffmpeg-devel
On 5/25/2024 8:24 PM, Lynne via ffmpeg-devel wrote:
> On 26/05/2024 00:31, James Almer wrote:
>> On 5/25/2024 5:57 PM, Lynne via ffmpeg-devel wrote:
>>> The inline asm function had issues running under checkasm.
>>> So I came to finish what I started, and wrote the last part
>>> of LPC computation in assembly.
>>>
>>> autocorr_10_c: 135525.8
>>> autocorr_10_sse2: 50729.8
>>> autocorr_10_fma3: 19007.8
>>> autocorr_30_c: 390100.8
>>> autocorr_30_sse2: 142478.8
>>> autocorr_30_fma3: 50559.8
>>> autocorr_32_c: 407058.3
>>> autocorr_32_sse2: 151633.3
>>> autocorr_32_fma3: 50517.3
>>> ---
>>> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
>>> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
>>> 2 files changed, 100 insertions(+), 78 deletions(-)
>>>
>>> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
>>> index a585c17ef5..790841b7f4 100644
>>> --- a/libavcodec/x86/lpc.asm
>>> +++ b/libavcodec/x86/lpc.asm
>>> @@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0
>>> dec_tab_scalar: times 2 dq -1.0
>>> seq_tab_sse2: dq 1.0, 0.0
>>> +autoc_init_tab: times 4 dq 1.0
>>> +
>>> SECTION .text
>>> %macro APPLY_WELCH_FN 0
>>> @@ -261,3 +263,92 @@ APPLY_WELCH_FN
>>> INIT_YMM avx2
>>> APPLY_WELCH_FN
>>> %endif
>>> +
>>> +%macro COMPUTE_AUTOCORR_FN 0
>>> +cglobal lpc_compute_autocorr, 4, 7, 8, data, len, lag, autoc, lag_p,
>>> data_l, len_p
>>
>> Already mentioned, but it should be 3 not 8.
>
> Already done, as said on IRC not 10 minutes after I submitted it.
>
>>
>>> +
>>> + shl lagd, 3
>>> + shl lenq, 3
>>> + xor lag_pq, lag_pq
>>> +
>>> +.lag_l:
>>> + movaps m8, [autoc_init_tab]
>>
>> m2
>>
>>> +
>>> + mov len_pq, lag_pq
>>> +
>>> + lea data_lq, [lag_pq + mmsize - 8]
>>> + neg data_lq ; -j - mmsize
>>> + add data_lq, dataq ; data[-j - mmsize]
>>> +.len_l:
>>> + ; We waste the upper value here on SSE2,
>>> + ; but we use it on AVX.
>>> + movupd xm0, [dataq + len_pq] ; data[i]
>>
>> movsd
>
> Fixed.
>
>>
>>> + movupd m1, [data_lq + len_pq] ; data[i - j]
>>> +
>>> +%if cpuflag(avx)
>>
>> %if mmsize == 32 here and everywhere else.
>
> Done.
>
>>
>>> + vbroadcastsd m0, xm0
>>
>> This is AVX2. AVX only has memory input argument. So use that and save
>> the movsd from above for the FMA3 version.
>>
>>> + vperm2f128 m1, m1, m1, 0x01
>>
>> Aren't you loading 16 extra bytes for no reason if you're just going
>> to use the upper 16 bytes from the load above?
>
> Lane swapped, like you mentioned.
>
>>> +%endif
>>> +
>>> + shufpd m0, m0, m0, 1100b
>>
>> The last argument has two bits, not four. What you're doing here is a
>> splat/broadcast, so you don't need it for FMA3.
>>
>>> + shufpd m1, m1, m1, 0101b
>>
>> The upper two bits of imm8 are ignored.
>
> Intentional. Not ignored on FMA3.
>
>>> +
>>> +%if cpuflag(fma3)
>>> + fmaddpd m8, m0, m1, m8 ; sum += data[i]*data[i-j]
>>> +%else
>>> + mulpd m0, m1
>>> + addpd m8, m0 ; sum += data[i]*data[i-j]
>>> +%endif
>>> +
>>> + add len_pq, 8
>>> + cmp len_pq, lenq
>>> + jl .len_l
>>> +
>>> + movups [autocq + lag_pq], m8 ; autoc[j] = sum
>>> + add lag_pq, mmsize
>>> + cmp lag_pq, lagq
>>> + jl .lag_l
>>> +
>>> + ; The tail computation is guaranteed never to happen
>>> + ; as long as we're doing multiples of 4, rather than 2.
>>> + ; It is trivial to convert this to avx if ever needed.
>>> +%if !cpuflag(avx)
>>
>> This doesn't seem to be tested as is. Maybe the checkasm should try
>> other lag values?
>
> That's for the checkasm patch. You can trigger this check with
> fate-alac-16-lpc-orders as-is.
Checkasm should test the entire function, so if an odd lag value will
trigger this chunk, it should be tested.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-25 22:45 ` James Almer
@ 2024-05-26 0:02 ` Lynne via ffmpeg-devel
2024-05-26 0:09 ` James Almer
0 siblings, 1 reply; 14+ messages in thread
From: Lynne via ffmpeg-devel @ 2024-05-26 0:02 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lynne
[-- Attachment #1.1.1.1: Type: text/plain, Size: 6137 bytes --]
On 26/05/2024 00:45, James Almer wrote:
> On 5/25/2024 7:31 PM, James Almer wrote:
>> On 5/25/2024 5:57 PM, Lynne via ffmpeg-devel wrote:
>>> The inline asm function had issues running under checkasm.
>>> So I came to finish what I started, and wrote the last part
>>> of LPC computation in assembly.
>>>
>>> autocorr_10_c: 135525.8
>>> autocorr_10_sse2: 50729.8
>>> autocorr_10_fma3: 19007.8
>>> autocorr_30_c: 390100.8
>>> autocorr_30_sse2: 142478.8
>>> autocorr_30_fma3: 50559.8
>>> autocorr_32_c: 407058.3
>>> autocorr_32_sse2: 151633.3
>>> autocorr_32_fma3: 50517.3
>>> ---
>>> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
>>> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
>>> 2 files changed, 100 insertions(+), 78 deletions(-)
>>>
>>> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
>>> index a585c17ef5..790841b7f4 100644
>>> --- a/libavcodec/x86/lpc.asm
>>> +++ b/libavcodec/x86/lpc.asm
>>> @@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0
>>> dec_tab_scalar: times 2 dq -1.0
>>> seq_tab_sse2: dq 1.0, 0.0
>>> +autoc_init_tab: times 4 dq 1.0
>>> +
>>> SECTION .text
>>> %macro APPLY_WELCH_FN 0
>>> @@ -261,3 +263,92 @@ APPLY_WELCH_FN
>>> INIT_YMM avx2
>>> APPLY_WELCH_FN
>>> %endif
>>> +
>>> +%macro COMPUTE_AUTOCORR_FN 0
>>> +cglobal lpc_compute_autocorr, 4, 7, 8, data, len, lag, autoc, lag_p,
>>> data_l, len_p
>>
>> Already mentioned, but it should be 3 not 8.
>>
>>> +
>>> + shl lagd, 3
>>> + shl lenq, 3
>>> + xor lag_pq, lag_pq
>>> +
>>> +.lag_l:
>>> + movaps m8, [autoc_init_tab]
>>
>> m2
>>
>>> +
>>> + mov len_pq, lag_pq
>>> +
>>> + lea data_lq, [lag_pq + mmsize - 8]
>>> + neg data_lq ; -j - mmsize
>>> + add data_lq, dataq ; data[-j - mmsize]
>>> +.len_l:
>>> + ; We waste the upper value here on SSE2,
>>> + ; but we use it on AVX.
>>> + movupd xm0, [dataq + len_pq] ; data[i]
>>
>> movsd
>>
>>> + movupd m1, [data_lq + len_pq] ; data[i - j]
>>> +
>>> +%if cpuflag(avx)
>>
>> %if mmsize == 32 here and everywhere else.
>>
>>> + vbroadcastsd m0, xm0
>>
>> This is AVX2. AVX only has memory input argument. So use that and save
>> the movsd from above for the FMA3 version.
>>
>>> + vperm2f128 m1, m1, m1, 0x01
>>
>> Aren't you loading 16 extra bytes for no reason if you're just going
>> to use the upper 16 bytes from the load above?
>
> Nevermind, this is swapping lanes.
>
> That aside, these versions are barely better and sometimes worse in all
> my tests on win64 with GCC with certain seeds.
> For example, seed 4022958484 gives me:
>
> autocorr_10_c: 21345.6
> autocorr_10_sse2: 16434.6
> autocorr_10_fma3: 24154.6
> autocorr_30_c: 59239.1
> autocorr_30_sse2: 46114.6
> autocorr_30_fma3: 64147.1
> autocorr_32_c: 63022.1
> autocorr_32_sse2: 50040.1
> autocorr_32_fma3: 66594.1
>
> But seed 2236774811 gives me:
>
> autocorr_10_c: 37135.3
> autocorr_10_sse2: 26492.3
> autocorr_10_fma3: 32943.3
> autocorr_30_c: 102266.8
> autocorr_30_sse2: 72933.3
> autocorr_30_fma3: 85808.3
> autocorr_32_c: 106537.8
> autocorr_32_sse2: 77623.3
> autocorr_32_fma3: 85844.3
>
> But if i force len to always be 4999 instead of its value varying
> depending on seed, i consistently get things like:
>
> autocorr_10_c: 40447.3
> autocorr_10_sse2: 39526.8
> autocorr_10_fma3: 42955.3
> autocorr_30_c: 111362.3
> autocorr_30_sse2: 111408.3
> autocorr_30_fma3: 116781.8
> autocorr_32_c: 122388.3
> autocorr_32_sse2: 119125.3
> autocorr_32_fma3: 114239.3
>
> It would help if someone else could confirm this, but overall i don't
> see any worthwhile gain here. The old inline version, for those seeds
> where it worked, was somewhat faster.
The metrics given are on Zen 3, with Clang with compiler optimizations
disabled.
We do not rely on compiler optimizations, and have plenty of assembly
which turns out to be slower than modern compilers autovectorizing (even
though we disable tree vectorization on GCC, that does not apply to
simple loops like this one). On the other hand, we also support ancient
compilers, and compilers which have no understanding of vectorization at
all.
To illustrate how different results can look on different arches and
compilers, and even platforms (you mentioned you tested only on win64):
Zen 3, gcc-9, O2:
autocorr_10_c: 48796.8
autocorr_10_sse2: 39571.8
autocorr_10_fma3: 30272.8
autocorr_30_c: 138499.3
autocorr_30_sse2: 114091.3
autocorr_30_fma3: 82114.3
autocorr_32_c: 146466.8
autocorr_32_sse2: 118400.8
autocorr_32_fma3: 80473.8
Zen 3, gcc-14, O2:
autocorr_10_c: 44981.3
autocorr_10_sse2: 36481.3
autocorr_10_fma3: 18418.8
autocorr_30_c: 129462.8
autocorr_30_sse2: 104175.3
autocorr_30_fma3: 48670.3
autocorr_32_c: 135625.3
autocorr_32_sse2: 109079.8
autocorr_32_fma3: 48670.3
Zen 3, clang-18, O2:
autocorr_10_c: 51872.6
autocorr_10_sse2: 48311.1
autocorr_10_fma3: 30070.1
autocorr_30_c: 145899.6
autocorr_30_sse2: 135793.1
autocorr_30_fma3: 79922.6
autocorr_32_c: 160443.1
autocorr_32_sse2: 147591.1
autocorr_32_fma3: 80075.6
Skylake, gcc-14, O2:
autocorr_10_c: 149251.0
autocorr_10_sse2: 133769.5
autocorr_10_fma3: 72886.0
autocorr_30_c: 396145.0
autocorr_30_sse2: 376618.5
autocorr_30_fma3: 194116.5
autocorr_32_c: 413219.0
autocorr_32_sse2: 400867.5
autocorr_32_fma3: 194117.5
Skylake, clang-18, O2:
autocorr_10_c: 153825.3
autocorr_10_sse2: 133774.3
autocorr_10_fma3: 72883.8
autocorr_30_c: 398339.8
autocorr_30_sse2: 376603.8
autocorr_30_fma3: 194098.8
autocorr_32_c: 432183.3
autocorr_32_sse2: 422583.8
autocorr_32_fma3: 194094.3
<Insert your favorite decade old compiler here>
But again, this is irrelevant, as we do not rely on compilers for
optimizations. We help them as much as we can, and when they work, its
nice, but that is in no way reliable, especially to turn down a patch
like this.
[-- Attachment #1.1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 637 bytes --]
[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 236 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-26 0:02 ` Lynne via ffmpeg-devel
@ 2024-05-26 0:09 ` James Almer
0 siblings, 0 replies; 14+ messages in thread
From: James Almer @ 2024-05-26 0:09 UTC (permalink / raw)
To: ffmpeg-devel
On 5/25/2024 9:02 PM, Lynne via ffmpeg-devel wrote:
> On 26/05/2024 00:45, James Almer wrote:
>> On 5/25/2024 7:31 PM, James Almer wrote:
>>> On 5/25/2024 5:57 PM, Lynne via ffmpeg-devel wrote:
>>>> The inline asm function had issues running under checkasm.
>>>> So I came to finish what I started, and wrote the last part
>>>> of LPC computation in assembly.
>>>>
>>>> autocorr_10_c: 135525.8
>>>> autocorr_10_sse2: 50729.8
>>>> autocorr_10_fma3: 19007.8
>>>> autocorr_30_c: 390100.8
>>>> autocorr_30_sse2: 142478.8
>>>> autocorr_30_fma3: 50559.8
>>>> autocorr_32_c: 407058.3
>>>> autocorr_32_sse2: 151633.3
>>>> autocorr_32_fma3: 50517.3
>>>> ---
>>>> libavcodec/x86/lpc.asm | 91
>>>> +++++++++++++++++++++++++++++++++++++++
>>>> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
>>>> 2 files changed, 100 insertions(+), 78 deletions(-)
>>>>
>>>> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
>>>> index a585c17ef5..790841b7f4 100644
>>>> --- a/libavcodec/x86/lpc.asm
>>>> +++ b/libavcodec/x86/lpc.asm
>>>> @@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0
>>>> dec_tab_scalar: times 2 dq -1.0
>>>> seq_tab_sse2: dq 1.0, 0.0
>>>> +autoc_init_tab: times 4 dq 1.0
>>>> +
>>>> SECTION .text
>>>> %macro APPLY_WELCH_FN 0
>>>> @@ -261,3 +263,92 @@ APPLY_WELCH_FN
>>>> INIT_YMM avx2
>>>> APPLY_WELCH_FN
>>>> %endif
>>>> +
>>>> +%macro COMPUTE_AUTOCORR_FN 0
>>>> +cglobal lpc_compute_autocorr, 4, 7, 8, data, len, lag, autoc,
>>>> lag_p, data_l, len_p
>>>
>>> Already mentioned, but it should be 3 not 8.
>>>
>>>> +
>>>> + shl lagd, 3
>>>> + shl lenq, 3
>>>> + xor lag_pq, lag_pq
>>>> +
>>>> +.lag_l:
>>>> + movaps m8, [autoc_init_tab]
>>>
>>> m2
>>>
>>>> +
>>>> + mov len_pq, lag_pq
>>>> +
>>>> + lea data_lq, [lag_pq + mmsize - 8]
>>>> + neg data_lq ; -j - mmsize
>>>> + add data_lq, dataq ; data[-j - mmsize]
>>>> +.len_l:
>>>> + ; We waste the upper value here on SSE2,
>>>> + ; but we use it on AVX.
>>>> + movupd xm0, [dataq + len_pq] ; data[i]
>>>
>>> movsd
>>>
>>>> + movupd m1, [data_lq + len_pq] ; data[i - j]
>>>> +
>>>> +%if cpuflag(avx)
>>>
>>> %if mmsize == 32 here and everywhere else.
>>>
>>>> + vbroadcastsd m0, xm0
>>>
>>> This is AVX2. AVX only has memory input argument. So use that and
>>> save the movsd from above for the FMA3 version.
>>>
>>>> + vperm2f128 m1, m1, m1, 0x01
>>>
>>> Aren't you loading 16 extra bytes for no reason if you're just going
>>> to use the upper 16 bytes from the load above?
>>
>> Nevermind, this is swapping lanes.
>>
>> That aside, these versions are barely better and sometimes worse in
>> all my tests on win64 with GCC with certain seeds.
>> For example, seed 4022958484 gives me:
>>
>> autocorr_10_c: 21345.6
>> autocorr_10_sse2: 16434.6
>> autocorr_10_fma3: 24154.6
>> autocorr_30_c: 59239.1
>> autocorr_30_sse2: 46114.6
>> autocorr_30_fma3: 64147.1
>> autocorr_32_c: 63022.1
>> autocorr_32_sse2: 50040.1
>> autocorr_32_fma3: 66594.1
>>
>> But seed 2236774811 gives me:
>>
>> autocorr_10_c: 37135.3
>> autocorr_10_sse2: 26492.3
>> autocorr_10_fma3: 32943.3
>> autocorr_30_c: 102266.8
>> autocorr_30_sse2: 72933.3
>> autocorr_30_fma3: 85808.3
>> autocorr_32_c: 106537.8
>> autocorr_32_sse2: 77623.3
>> autocorr_32_fma3: 85844.3
>>
>> But if i force len to always be 4999 instead of its value varying
>> depending on seed, i consistently get things like:
>>
>> autocorr_10_c: 40447.3
>> autocorr_10_sse2: 39526.8
>> autocorr_10_fma3: 42955.3
>> autocorr_30_c: 111362.3
>> autocorr_30_sse2: 111408.3
>> autocorr_30_fma3: 116781.8
>> autocorr_32_c: 122388.3
>> autocorr_32_sse2: 119125.3
>> autocorr_32_fma3: 114239.3
>>
>> It would help if someone else could confirm this, but overall i don't
>> see any worthwhile gain here. The old inline version, for those seeds
>> where it worked, was somewhat faster.
>
> The metrics given are on Zen 3, with Clang with compiler optimizations
> disabled.
> We do not rely on compiler optimizations, and have plenty of assembly
> which turns out to be slower than modern compilers autovectorizing (even
> though we disable tree vectorization on GCC, that does not apply to
> simple loops like this one). On the other hand, we also support ancient
> compilers, and compilers which have no understanding of vectorization at
> all.
Tree vectorization is disabled everywhere, including my target (GCC 14,
mingw-w64, Alder Lake i7).
> To illustrate how different results can look on different arches and
> compilers, and even platforms (you mentioned you tested only on win64):
>
> Zen 3, gcc-9, O2:
> autocorr_10_c: 48796.8
> autocorr_10_sse2: 39571.8
> autocorr_10_fma3: 30272.8
> autocorr_30_c: 138499.3
> autocorr_30_sse2: 114091.3
> autocorr_30_fma3: 82114.3
> autocorr_32_c: 146466.8
> autocorr_32_sse2: 118400.8
> autocorr_32_fma3: 80473.8
>
> Zen 3, gcc-14, O2:
> autocorr_10_c: 44981.3
> autocorr_10_sse2: 36481.3
> autocorr_10_fma3: 18418.8
> autocorr_30_c: 129462.8
> autocorr_30_sse2: 104175.3
> autocorr_30_fma3: 48670.3
> autocorr_32_c: 135625.3
> autocorr_32_sse2: 109079.8
> autocorr_32_fma3: 48670.3
>
> Zen 3, clang-18, O2:
> autocorr_10_c: 51872.6
> autocorr_10_sse2: 48311.1
> autocorr_10_fma3: 30070.1
> autocorr_30_c: 145899.6
> autocorr_30_sse2: 135793.1
> autocorr_30_fma3: 79922.6
> autocorr_32_c: 160443.1
> autocorr_32_sse2: 147591.1
> autocorr_32_fma3: 80075.6
>
> Skylake, gcc-14, O2:
> autocorr_10_c: 149251.0
> autocorr_10_sse2: 133769.5
> autocorr_10_fma3: 72886.0
> autocorr_30_c: 396145.0
> autocorr_30_sse2: 376618.5
> autocorr_30_fma3: 194116.5
> autocorr_32_c: 413219.0
> autocorr_32_sse2: 400867.5
> autocorr_32_fma3: 194117.5
>
> Skylake, clang-18, O2:
> autocorr_10_c: 153825.3
> autocorr_10_sse2: 133774.3
> autocorr_10_fma3: 72883.8
> autocorr_30_c: 398339.8
> autocorr_30_sse2: 376603.8
> autocorr_30_fma3: 194098.8
> autocorr_32_c: 432183.3
> autocorr_32_sse2: 422583.8
> autocorr_32_fma3: 194094.3
I see no such boost at all. You're getting twice the performance on fma3
than sse2 whereas i get fma3 worse than sse2 in many cases. There is
something fishy going on, hence me asking others to check to see if they
can reproduce it.
>
> <Insert your favorite decade old compiler here>
>
> But again, this is irrelevant, as we do not rely on compilers for
> optimizations. We help them as much as we can, and when they work, its
> nice, but that is in no way reliable, especially to turn down a patch
> like this.
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-25 20:57 [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm Lynne via ffmpeg-devel
2024-05-25 22:12 ` Michael Niedermayer
2024-05-25 22:31 ` James Almer
@ 2024-05-26 0:39 ` James Almer
2024-05-26 1:42 ` [FFmpeg-devel] [PATCH v2] " Lynne via ffmpeg-devel
3 siblings, 0 replies; 14+ messages in thread
From: James Almer @ 2024-05-26 0:39 UTC (permalink / raw)
To: ffmpeg-devel
On 5/25/2024 5:57 PM, Lynne via ffmpeg-devel wrote:
> The inline asm function had issues running under checkasm.
> So I came to finish what I started, and wrote the last part
> of LPC computation in assembly.
>
> autocorr_10_c: 135525.8
> autocorr_10_sse2: 50729.8
> autocorr_10_fma3: 19007.8
> autocorr_30_c: 390100.8
> autocorr_30_sse2: 142478.8
> autocorr_30_fma3: 50559.8
> autocorr_32_c: 407058.3
> autocorr_32_sse2: 151633.3
> autocorr_32_fma3: 50517.3
> ---
> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
> 2 files changed, 100 insertions(+), 78 deletions(-)
>
> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
> index a585c17ef5..790841b7f4 100644
> --- a/libavcodec/x86/lpc.asm
> +++ b/libavcodec/x86/lpc.asm
> @@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0
> dec_tab_scalar: times 2 dq -1.0
> seq_tab_sse2: dq 1.0, 0.0
>
> +autoc_init_tab: times 4 dq 1.0
There's one_tab already, so no need to add this.
> +
> SECTION .text
>
> %macro APPLY_WELCH_FN 0
> @@ -261,3 +263,92 @@ APPLY_WELCH_FN
> INIT_YMM avx2
> APPLY_WELCH_FN
> %endif
> +
> +%macro COMPUTE_AUTOCORR_FN 0
> +cglobal lpc_compute_autocorr, 4, 7, 8, data, len, lag, autoc, lag_p, data_l, len_p
> +
> + shl lagd, 3
> + shl lenq, 3
> + xor lag_pq, lag_pq
> +
> +.lag_l:
> + movaps m8, [autoc_init_tab]
> +
> + mov len_pq, lag_pq
> +
> + lea data_lq, [lag_pq + mmsize - 8]
> + neg data_lq ; -j - mmsize
> + add data_lq, dataq ; data[-j - mmsize]
> +.len_l:
> + ; We waste the upper value here on SSE2,
> + ; but we use it on AVX.
> + movupd xm0, [dataq + len_pq] ; data[i]
> + movupd m1, [data_lq + len_pq] ; data[i - j]
> +
> +%if cpuflag(avx)
> + vbroadcastsd m0, xm0
> + vperm2f128 m1, m1, m1, 0x01
You can do
vpermpd m1, [data_lq + len_pq], q0123
Which saves you the movupd + vperm2f128 + shufpd for fma3.
> +%endif
> +
> + shufpd m0, m0, m0, 1100b
> + shufpd m1, m1, m1, 0101b
> +
> +%if cpuflag(fma3)
> + fmaddpd m8, m0, m1, m8 ; sum += data[i]*data[i-j]
So i found out this instruction is what's killing performance for me. If
it remove it and use mulpd + addpd like in sse2, i see the same boost
you got.
Why? I have no idea. Other functions using this same instruction, like
ff_vector_dmac_scalar_fma3, don't see any performance hit.
> +%else
> + mulpd m0, m1
> + addpd m8, m0 ; sum += data[i]*data[i-j]
> +%endif
> +
> + add len_pq, 8
> + cmp len_pq, lenq
> + jl .len_l
> +
> + movups [autocq + lag_pq], m8 ; autoc[j] = sum
> + add lag_pq, mmsize
> + cmp lag_pq, lagq
> + jl .lag_l
> +
> + ; The tail computation is guaranteed never to happen
> + ; as long as we're doing multiples of 4, rather than 2.
> + ; It is trivial to convert this to avx if ever needed.
> +%if !cpuflag(avx)
> + jg .end
> + ; If lag_p == lag fallthrough
> +
> +.tail:
> + movaps xm2, [autoc_init_tab]
> +
> + mov len_pq, lag_pq
> + sub len_pq, mmsize
> +
> + lea data_lq, [lag_pq]
> + neg data_lq ; -j
> + add data_lq, dataq ; data[-j]
> +
> +.tail_l:
> + movupd xm0, [dataq + len_pq]
> + movupd xm1, [data_lq + len_pq]
> +
> + mulpd xm0, xm1
> + addpd xm2, xm0 ; sum += data[i]*data[i-j]
> +
> + add len_pq, mmsize
> + cmp len_pq, lenq
> + jl .tail_l
> +
> + shufpd xm1, xm2, xm2, 01b
> + addpd xm2, xm1
> +
> + movhpd [autocq + lag_pq], xm2
> +%endif
> +
> +.end:
> + RET
> +
> +%endmacro
> +
> +INIT_XMM sse2
> +COMPUTE_AUTOCORR_FN
> +INIT_YMM fma3
> +COMPUTE_AUTOCORR_FN
> diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
> index f2fca53799..96469fae40 100644
> --- a/libavcodec/x86/lpc_init.c
> +++ b/libavcodec/x86/lpc_init.c
> @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len,
> double *w_data);
> void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
> double *w_data);
> -
> -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
> -
> -#if HAVE_SSE2_INLINE
> -
> -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
> - double *autoc)
> -{
> - int j;
> -
> - if((x86_reg)data & 15)
> - data++;
> -
> - for(j=0; j<lag; j+=2){
> - x86_reg i = -len*sizeof(double);
> - if(j == lag-2) {
> - __asm__ volatile(
> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm2 \n\t"
> - "1: \n\t"
> - "movapd (%2,%0), %%xmm3 \n\t"
> - "movupd -8(%3,%0), %%xmm4 \n\t"
> - "movapd (%3,%0), %%xmm5 \n\t"
> - "mulpd %%xmm3, %%xmm4 \n\t"
> - "mulpd %%xmm3, %%xmm5 \n\t"
> - "mulpd -16(%3,%0), %%xmm3 \n\t"
> - "addpd %%xmm4, %%xmm1 \n\t"
> - "addpd %%xmm5, %%xmm0 \n\t"
> - "addpd %%xmm3, %%xmm2 \n\t"
> - "add $16, %0 \n\t"
> - "jl 1b \n\t"
> - "movhlps %%xmm0, %%xmm3 \n\t"
> - "movhlps %%xmm1, %%xmm4 \n\t"
> - "movhlps %%xmm2, %%xmm5 \n\t"
> - "addsd %%xmm3, %%xmm0 \n\t"
> - "addsd %%xmm4, %%xmm1 \n\t"
> - "addsd %%xmm5, %%xmm2 \n\t"
> - "movsd %%xmm0, (%1) \n\t"
> - "movsd %%xmm1, 8(%1) \n\t"
> - "movsd %%xmm2, 16(%1) \n\t"
> - :"+&r"(i)
> - :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
> - :"memory"
> - );
> - } else {
> - __asm__ volatile(
> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
> - "1: \n\t"
> - "movapd (%3,%0), %%xmm3 \n\t"
> - "movupd -8(%4,%0), %%xmm4 \n\t"
> - "mulpd %%xmm3, %%xmm4 \n\t"
> - "mulpd (%4,%0), %%xmm3 \n\t"
> - "addpd %%xmm4, %%xmm1 \n\t"
> - "addpd %%xmm3, %%xmm0 \n\t"
> - "add $16, %0 \n\t"
> - "jl 1b \n\t"
> - "movhlps %%xmm0, %%xmm3 \n\t"
> - "movhlps %%xmm1, %%xmm4 \n\t"
> - "addsd %%xmm3, %%xmm0 \n\t"
> - "addsd %%xmm4, %%xmm1 \n\t"
> - "movsd %%xmm0, %1 \n\t"
> - "movsd %%xmm1, %2 \n\t"
> - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
> - :"r"(data+len), "r"(data+len-j)
> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
> - );
> - }
> - }
> -}
> -
> -#endif /* HAVE_SSE2_INLINE */
> +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
> + double *autoc);
> +void ff_lpc_compute_autocorr_fma3(const double *data, ptrdiff_t len, int lag,
> + double *autoc);
>
> av_cold void ff_lpc_init_x86(LPCContext *c)
> {
> int cpu_flags = av_get_cpu_flags();
>
> -#if HAVE_SSE2_INLINE
> - if (INLINE_SSE2_SLOW(cpu_flags))
> - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
> -#endif
> + if (EXTERNAL_SSE2(cpu_flags))
> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
> +
> + if (EXTERNAL_FMA3(cpu_flags))
> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_fma3;
>
> if (EXTERNAL_SSE2(cpu_flags))
> c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* [FFmpeg-devel] [PATCH v2] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-25 20:57 [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm Lynne via ffmpeg-devel
` (2 preceding siblings ...)
2024-05-26 0:39 ` James Almer
@ 2024-05-26 1:42 ` Lynne via ffmpeg-devel
2024-05-26 1:51 ` James Almer
2024-05-26 19:43 ` Michael Niedermayer
3 siblings, 2 replies; 14+ messages in thread
From: Lynne via ffmpeg-devel @ 2024-05-26 1:42 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lynne
The inline asm function had issues running under checkasm.
So I came to finish what I started, and wrote the last part
of LPC computation in assembly.
---
libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
2 files changed, 100 insertions(+), 78 deletions(-)
diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
index a585c17ef5..9c359ae480 100644
--- a/libavcodec/x86/lpc.asm
+++ b/libavcodec/x86/lpc.asm
@@ -261,3 +261,94 @@ APPLY_WELCH_FN
INIT_YMM avx2
APPLY_WELCH_FN
%endif
+
+%macro COMPUTE_AUTOCORR_FN 0
+cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p, data_l, len_p
+ shl lagd, 3
+ shl lenq, 3
+ xor lag_pq, lag_pq
+
+.lag_l:
+ movaps m2, [one_tab]
+
+ mov len_pq, lag_pq
+
+ lea data_lq, [lag_pq + mmsize - 8]
+ neg data_lq ; -j - mmsize
+ add data_lq, dataq ; data[-j - mmsize]
+.len_l:
+
+%if mmsize == 32
+ vbroadcastsd m0, [dataq + len_pq]
+ vpermpd m1, [data_lq + len_pq], q0123
+%else
+ movupd m1, [data_lq + len_pq] ; data[i - j]
+ movsd xm0, [dataq + len_pq] ; data[i]
+ shufpd m1, m1, m1, 01b
+%endif
+
+ shufpd m0, m0, m0, 1100b
+
+ ; fmadd actually hurts performance in this case due to
+ ; the earlier loads + shuffles
+ mulpd m0, m1
+ addpd m2, m0 ; sum += data[i]*data[i-j]
+
+ add len_pq, 8
+ cmp len_pq, lenq
+ jl .len_l
+
+ movupd [autocq + lag_pq], m2 ; autoc[j] = sum
+ add lag_pq, mmsize
+ cmp lag_pq, lagq
+ jl .lag_l
+
+ ; The tail computation is guaranteed never to happen
+ ; as long as we're doing multiples of 4, rather than 2.
+%if mmsize != 32
+ jg .end
+ ; If lag_p == lag fallthrough
+
+.tail:
+ movaps m2, [one_tab]
+
+ mov len_pq, lag_pq
+ sub len_pq, mmsize
+
+ lea data_lq, [lag_pq]
+ neg data_lq ; -j
+ add data_lq, dataq ; data[-j]
+
+.tail_l:
+ movupd m0, [dataq + len_pq]
+ movupd m1, [data_lq + len_pq]
+
+ mulpd m0, m1
+ addpd m2, m0 ; sum += data[i]*data[i-j]
+
+ add len_pq, mmsize
+ cmp len_pq, lenq
+ jl .tail_l
+
+ shufpd m1, m2, m2, 01b
+ addpd m2, m1
+
+ ; Leave this here just in case its ever needed
+%if mmsize == 32
+ vperm2f128 m1, m2, m2, 0x01
+ addpd xm2, xm1
+ movupd [autocq + lag_pq], xm2
+%else
+ movhpd [autocq + lag_pq], xm2
+%endif
+
+.end:
+%endif
+
+ RET
+%endmacro
+
+INIT_XMM sse2
+COMPUTE_AUTOCORR_FN
+INIT_YMM avx
+COMPUTE_AUTOCORR_FN
diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
index f2fca53799..bb174be53e 100644
--- a/libavcodec/x86/lpc_init.c
+++ b/libavcodec/x86/lpc_init.c
@@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len,
double *w_data);
void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
double *w_data);
-
-DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
-
-#if HAVE_SSE2_INLINE
-
-static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
- double *autoc)
-{
- int j;
-
- if((x86_reg)data & 15)
- data++;
-
- for(j=0; j<lag; j+=2){
- x86_reg i = -len*sizeof(double);
- if(j == lag-2) {
- __asm__ volatile(
- "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm2 \n\t"
- "1: \n\t"
- "movapd (%2,%0), %%xmm3 \n\t"
- "movupd -8(%3,%0), %%xmm4 \n\t"
- "movapd (%3,%0), %%xmm5 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm5 \n\t"
- "mulpd -16(%3,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm5, %%xmm0 \n\t"
- "addpd %%xmm3, %%xmm2 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "movhlps %%xmm2, %%xmm5 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "addsd %%xmm5, %%xmm2 \n\t"
- "movsd %%xmm0, (%1) \n\t"
- "movsd %%xmm1, 8(%1) \n\t"
- "movsd %%xmm2, 16(%1) \n\t"
- :"+&r"(i)
- :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
- NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
- :"memory"
- );
- } else {
- __asm__ volatile(
- "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
- "1: \n\t"
- "movapd (%3,%0), %%xmm3 \n\t"
- "movupd -8(%4,%0), %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd (%4,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm3, %%xmm0 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "movsd %%xmm0, %1 \n\t"
- "movsd %%xmm1, %2 \n\t"
- :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
- :"r"(data+len), "r"(data+len-j)
- NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
- );
- }
- }
-}
-
-#endif /* HAVE_SSE2_INLINE */
+void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
+ double *autoc);
+void ff_lpc_compute_autocorr_avx(const double *data, ptrdiff_t len, int lag,
+ double *autoc);
av_cold void ff_lpc_init_x86(LPCContext *c)
{
int cpu_flags = av_get_cpu_flags();
-#if HAVE_SSE2_INLINE
- if (INLINE_SSE2_SLOW(cpu_flags))
- c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
-#endif
+ if (EXTERNAL_SSE2(cpu_flags))
+ c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
+
+ if (EXTERNAL_AVX_FAST(cpu_flags))
+ c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx;
if (EXTERNAL_SSE2(cpu_flags))
c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
--
2.43.0.381.gb435a96ce8
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-26 1:42 ` [FFmpeg-devel] [PATCH v2] " Lynne via ffmpeg-devel
@ 2024-05-26 1:51 ` James Almer
2024-05-26 2:16 ` James Almer
2024-05-26 19:43 ` Michael Niedermayer
1 sibling, 1 reply; 14+ messages in thread
From: James Almer @ 2024-05-26 1:51 UTC (permalink / raw)
To: ffmpeg-devel
On 5/25/2024 10:42 PM, Lynne via ffmpeg-devel wrote:
> The inline asm function had issues running under checkasm.
> So I came to finish what I started, and wrote the last part
> of LPC computation in assembly.
> ---
> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
> 2 files changed, 100 insertions(+), 78 deletions(-)
>
> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
> index a585c17ef5..9c359ae480 100644
> --- a/libavcodec/x86/lpc.asm
> +++ b/libavcodec/x86/lpc.asm
> @@ -261,3 +261,94 @@ APPLY_WELCH_FN
> INIT_YMM avx2
> APPLY_WELCH_FN
> %endif
> +
> +%macro COMPUTE_AUTOCORR_FN 0
> +cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p, data_l, len_p
> + shl lagd, 3
> + shl lenq, 3
> + xor lag_pq, lag_pq
> +
> +.lag_l:
> + movaps m2, [one_tab]
Super nit: movapd
> +
> + mov len_pq, lag_pq
> +
> + lea data_lq, [lag_pq + mmsize - 8]
> + neg data_lq ; -j - mmsize
> + add data_lq, dataq ; data[-j - mmsize]
> +.len_l:
> +
> +%if mmsize == 32
> + vbroadcastsd m0, [dataq + len_pq]
> + vpermpd m1, [data_lq + len_pq], q0123
> +%else
> + movupd m1, [data_lq + len_pq] ; data[i - j]
> + movsd xm0, [dataq + len_pq] ; data[i]
> + shufpd m1, m1, m1, 01b
> +%endif
> +
> + shufpd m0, m0, m0, 1100b
This is not needed for mmsize == 32. The broadcast set every qword to
the value movsd loaded.
> +
> + ; fmadd actually hurts performance in this case due to
> + ; the earlier loads + shuffles
> + mulpd m0, m1
> + addpd m2, m0 ; sum += data[i]*data[i-j]
> +
> + add len_pq, 8
> + cmp len_pq, lenq
> + jl .len_l
> +
> + movupd [autocq + lag_pq], m2 ; autoc[j] = sum
> + add lag_pq, mmsize
> + cmp lag_pq, lagq
> + jl .lag_l
> +
> + ; The tail computation is guaranteed never to happen
> + ; as long as we're doing multiples of 4, rather than 2.
> +%if mmsize != 32
> + jg .end
> + ; If lag_p == lag fallthrough
> +
> +.tail:
> + movaps m2, [one_tab]
> +
> + mov len_pq, lag_pq
> + sub len_pq, mmsize
> +
> + lea data_lq, [lag_pq]
> + neg data_lq ; -j
> + add data_lq, dataq ; data[-j]
> +
> +.tail_l:
> + movupd m0, [dataq + len_pq]
> + movupd m1, [data_lq + len_pq]
> +
> + mulpd m0, m1
> + addpd m2, m0 ; sum += data[i]*data[i-j]
> +
> + add len_pq, mmsize
> + cmp len_pq, lenq
> + jl .tail_l
> +
> + shufpd m1, m2, m2, 01b
> + addpd m2, m1
> +
> + ; Leave this here just in case its ever needed
> +%if mmsize == 32
> + vperm2f128 m1, m2, m2, 0x01
> + addpd xm2, xm1
> + movupd [autocq + lag_pq], xm2
> +%else
> + movhpd [autocq + lag_pq], xm2
> +%endif
> +
> +.end:
> +%endif
> +
> + RET
> +%endmacro
> +
> +INIT_XMM sse2
> +COMPUTE_AUTOCORR_FN
> +INIT_YMM avx
vpermpd is avx2, so it needs to be that.
> +COMPUTE_AUTOCORR_FN
> diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
> index f2fca53799..bb174be53e 100644
> --- a/libavcodec/x86/lpc_init.c
> +++ b/libavcodec/x86/lpc_init.c
> @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len,
> double *w_data);
> void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
> double *w_data);
> -
> -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
> -
> -#if HAVE_SSE2_INLINE
> -
> -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
> - double *autoc)
> -{
> - int j;
> -
> - if((x86_reg)data & 15)
> - data++;
> -
> - for(j=0; j<lag; j+=2){
> - x86_reg i = -len*sizeof(double);
> - if(j == lag-2) {
> - __asm__ volatile(
> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm2 \n\t"
> - "1: \n\t"
> - "movapd (%2,%0), %%xmm3 \n\t"
> - "movupd -8(%3,%0), %%xmm4 \n\t"
> - "movapd (%3,%0), %%xmm5 \n\t"
> - "mulpd %%xmm3, %%xmm4 \n\t"
> - "mulpd %%xmm3, %%xmm5 \n\t"
> - "mulpd -16(%3,%0), %%xmm3 \n\t"
> - "addpd %%xmm4, %%xmm1 \n\t"
> - "addpd %%xmm5, %%xmm0 \n\t"
> - "addpd %%xmm3, %%xmm2 \n\t"
> - "add $16, %0 \n\t"
> - "jl 1b \n\t"
> - "movhlps %%xmm0, %%xmm3 \n\t"
> - "movhlps %%xmm1, %%xmm4 \n\t"
> - "movhlps %%xmm2, %%xmm5 \n\t"
> - "addsd %%xmm3, %%xmm0 \n\t"
> - "addsd %%xmm4, %%xmm1 \n\t"
> - "addsd %%xmm5, %%xmm2 \n\t"
> - "movsd %%xmm0, (%1) \n\t"
> - "movsd %%xmm1, 8(%1) \n\t"
> - "movsd %%xmm2, 16(%1) \n\t"
> - :"+&r"(i)
> - :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
> - :"memory"
> - );
> - } else {
> - __asm__ volatile(
> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
> - "1: \n\t"
> - "movapd (%3,%0), %%xmm3 \n\t"
> - "movupd -8(%4,%0), %%xmm4 \n\t"
> - "mulpd %%xmm3, %%xmm4 \n\t"
> - "mulpd (%4,%0), %%xmm3 \n\t"
> - "addpd %%xmm4, %%xmm1 \n\t"
> - "addpd %%xmm3, %%xmm0 \n\t"
> - "add $16, %0 \n\t"
> - "jl 1b \n\t"
> - "movhlps %%xmm0, %%xmm3 \n\t"
> - "movhlps %%xmm1, %%xmm4 \n\t"
> - "addsd %%xmm3, %%xmm0 \n\t"
> - "addsd %%xmm4, %%xmm1 \n\t"
> - "movsd %%xmm0, %1 \n\t"
> - "movsd %%xmm1, %2 \n\t"
> - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
> - :"r"(data+len), "r"(data+len-j)
> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
> - );
> - }
> - }
> -}
> -
> -#endif /* HAVE_SSE2_INLINE */
> +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
> + double *autoc);
> +void ff_lpc_compute_autocorr_avx(const double *data, ptrdiff_t len, int lag,
> + double *autoc);
>
> av_cold void ff_lpc_init_x86(LPCContext *c)
> {
> int cpu_flags = av_get_cpu_flags();
>
> -#if HAVE_SSE2_INLINE
> - if (INLINE_SSE2_SLOW(cpu_flags))
> - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
> -#endif
> + if (EXTERNAL_SSE2(cpu_flags))
> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
Place this with ff_lpc_apply_welch_window_sse2 below.
> +
> + if (EXTERNAL_AVX_FAST(cpu_flags))
> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx;
>
> if (EXTERNAL_SSE2(cpu_flags))
> c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-26 1:51 ` James Almer
@ 2024-05-26 2:16 ` James Almer
0 siblings, 0 replies; 14+ messages in thread
From: James Almer @ 2024-05-26 2:16 UTC (permalink / raw)
To: ffmpeg-devel
On 5/25/2024 10:51 PM, James Almer wrote:
> On 5/25/2024 10:42 PM, Lynne via ffmpeg-devel wrote:
>> The inline asm function had issues running under checkasm.
>> So I came to finish what I started, and wrote the last part
>> of LPC computation in assembly.
>> ---
>> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
>> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
>> 2 files changed, 100 insertions(+), 78 deletions(-)
>>
>> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
>> index a585c17ef5..9c359ae480 100644
>> --- a/libavcodec/x86/lpc.asm
>> +++ b/libavcodec/x86/lpc.asm
>> @@ -261,3 +261,94 @@ APPLY_WELCH_FN
>> INIT_YMM avx2
>> APPLY_WELCH_FN
>> %endif
>> +
>> +%macro COMPUTE_AUTOCORR_FN 0
>> +cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p,
>> data_l, len_p
>> + shl lagd, 3
>> + shl lenq, 3
>> + xor lag_pq, lag_pq
>> +
>> +.lag_l:
>> + movaps m2, [one_tab]
>
> Super nit: movapd
>
>> +
>> + mov len_pq, lag_pq
>> +
>> + lea data_lq, [lag_pq + mmsize - 8]
>> + neg data_lq ; -j - mmsize
>> + add data_lq, dataq ; data[-j - mmsize]
>> +.len_l:
>> +
>> +%if mmsize == 32
>> + vbroadcastsd m0, [dataq + len_pq]
>> + vpermpd m1, [data_lq + len_pq], q0123
>> +%else
>> + movupd m1, [data_lq + len_pq] ; data[i - j]
>> + movsd xm0, [dataq + len_pq] ; data[i]
>> + shufpd m1, m1, m1, 01b
I just realized you're shuffling the values inside the len_1 loop when
you could do it right before you store the sum.
Something like:
[...]
.len_l:
%if mmsize == 16
movsd m0, [dataq + len_pq] ; data[i]
shufpd m0, m0, m0, 0
movupd m1, [data_lq + len_pq] ; data[i - j]
mulpd m0, m1
%else
vbroadcastsd m0, [dataq + len_pq]
mulpd m0, [data_lq + len_pq] ; data[i - j]
%endif
addpd m2, m0 ; sum += data[i]*data[i-j]
add len_pq, 8
cmp len_pq, lenq
jl .len_l
shufpd m2, m2, m2, 0101b
%if mmsize == 32
vextractf128 [autocq + lag_pq], m2, 1
movupd [autocq + lag_pq + 16], xm2 ; autoc[j] = sum
%else
movupd [autocq + lag_pq], m2 ; autoc[j] = sum
%endif
add lag_pq, mmsize
cmp lag_pq, lagq
jl .lag_l
[...]
And by using vextractf128 here instead of vpermpd you can keep the
function as avx instead of avx2, unless a vpermpd + single 256bit store
is faster than shufpd + two stores (vextractf128 + movu 128bit), which i
assume it wont because of crosslane shuffling.
>> +%endif
>> +
>> + shufpd m0, m0, m0, 1100b
>
> This is not needed for mmsize == 32. The broadcast set every qword to
> the value movsd loaded.
>
>> +
>> + ; fmadd actually hurts performance in this case due to
>> + ; the earlier loads + shuffles
>> + mulpd m0, m1
>> + addpd m2, m0 ; sum += data[i]*data[i-j]
>> +
>> + add len_pq, 8
>> + cmp len_pq, lenq
>> + jl .len_l
>> +
>> + movupd [autocq + lag_pq], m2 ; autoc[j] = sum
>> + add lag_pq, mmsize
>> + cmp lag_pq, lagq
>> + jl .lag_l
>> +
>> + ; The tail computation is guaranteed never to happen
>> + ; as long as we're doing multiples of 4, rather than 2.
>> +%if mmsize != 32
>> + jg .end
>> + ; If lag_p == lag fallthrough
>> +
>> +.tail:
>> + movaps m2, [one_tab]
>> +
>> + mov len_pq, lag_pq
>> + sub len_pq, mmsize
>> +
>> + lea data_lq, [lag_pq]
>> + neg data_lq ; -j
>> + add data_lq, dataq ; data[-j]
>> +
>> +.tail_l:
>> + movupd m0, [dataq + len_pq]
>> + movupd m1, [data_lq + len_pq]
>> +
>> + mulpd m0, m1
>> + addpd m2, m0 ; sum += data[i]*data[i-j]
>> +
>> + add len_pq, mmsize
>> + cmp len_pq, lenq
>> + jl .tail_l
>> +
>> + shufpd m1, m2, m2, 01b
>> + addpd m2, m1
>> +
>> + ; Leave this here just in case its ever needed
>> +%if mmsize == 32
>> + vperm2f128 m1, m2, m2, 0x01
>> + addpd xm2, xm1
>> + movupd [autocq + lag_pq], xm2
>> +%else
>> + movhpd [autocq + lag_pq], xm2
>> +%endif
>> +
>> +.end:
>> +%endif
>> +
>> + RET
>> +%endmacro
>> +
>> +INIT_XMM sse2
>> +COMPUTE_AUTOCORR_FN
>> +INIT_YMM avx
>
> vpermpd is avx2, so it needs to be that.
>
>> +COMPUTE_AUTOCORR_FN
>> diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
>> index f2fca53799..bb174be53e 100644
>> --- a/libavcodec/x86/lpc_init.c
>> +++ b/libavcodec/x86/lpc_init.c
>> @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t
>> *data, ptrdiff_t len,
>> double *w_data);
>> void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
>> double *w_data);
>> -
>> -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
>> -
>> -#if HAVE_SSE2_INLINE
>> -
>> -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t
>> len, int lag,
>> - double *autoc)
>> -{
>> - int j;
>> -
>> - if((x86_reg)data & 15)
>> - data++;
>> -
>> - for(j=0; j<lag; j+=2){
>> - x86_reg i = -len*sizeof(double);
>> - if(j == lag-2) {
>> - __asm__ volatile(
>> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
>> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
>> - "movsd "MANGLE(pd_1)", %%xmm2 \n\t"
>> - "1: \n\t"
>> - "movapd (%2,%0), %%xmm3 \n\t"
>> - "movupd -8(%3,%0), %%xmm4 \n\t"
>> - "movapd (%3,%0), %%xmm5 \n\t"
>> - "mulpd %%xmm3, %%xmm4 \n\t"
>> - "mulpd %%xmm3, %%xmm5 \n\t"
>> - "mulpd -16(%3,%0), %%xmm3 \n\t"
>> - "addpd %%xmm4, %%xmm1 \n\t"
>> - "addpd %%xmm5, %%xmm0 \n\t"
>> - "addpd %%xmm3, %%xmm2 \n\t"
>> - "add $16, %0 \n\t"
>> - "jl 1b \n\t"
>> - "movhlps %%xmm0, %%xmm3 \n\t"
>> - "movhlps %%xmm1, %%xmm4 \n\t"
>> - "movhlps %%xmm2, %%xmm5 \n\t"
>> - "addsd %%xmm3, %%xmm0 \n\t"
>> - "addsd %%xmm4, %%xmm1 \n\t"
>> - "addsd %%xmm5, %%xmm2 \n\t"
>> - "movsd %%xmm0, (%1) \n\t"
>> - "movsd %%xmm1, 8(%1) \n\t"
>> - "movsd %%xmm2, 16(%1) \n\t"
>> - :"+&r"(i)
>> - :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
>> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
>> - :"memory"
>> - );
>> - } else {
>> - __asm__ volatile(
>> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
>> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
>> - "1: \n\t"
>> - "movapd (%3,%0), %%xmm3 \n\t"
>> - "movupd -8(%4,%0), %%xmm4 \n\t"
>> - "mulpd %%xmm3, %%xmm4 \n\t"
>> - "mulpd (%4,%0), %%xmm3 \n\t"
>> - "addpd %%xmm4, %%xmm1 \n\t"
>> - "addpd %%xmm3, %%xmm0 \n\t"
>> - "add $16, %0 \n\t"
>> - "jl 1b \n\t"
>> - "movhlps %%xmm0, %%xmm3 \n\t"
>> - "movhlps %%xmm1, %%xmm4 \n\t"
>> - "addsd %%xmm3, %%xmm0 \n\t"
>> - "addsd %%xmm4, %%xmm1 \n\t"
>> - "movsd %%xmm0, %1 \n\t"
>> - "movsd %%xmm1, %2 \n\t"
>> - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
>> - :"r"(data+len), "r"(data+len-j)
>> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
>> - );
>> - }
>> - }
>> -}
>> -
>> -#endif /* HAVE_SSE2_INLINE */
>> +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len,
>> int lag,
>> + double *autoc);
>> +void ff_lpc_compute_autocorr_avx(const double *data, ptrdiff_t len,
>> int lag,
>> + double *autoc);
>> av_cold void ff_lpc_init_x86(LPCContext *c)
>> {
>> int cpu_flags = av_get_cpu_flags();
>> -#if HAVE_SSE2_INLINE
>> - if (INLINE_SSE2_SLOW(cpu_flags))
>> - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
>> -#endif
>> + if (EXTERNAL_SSE2(cpu_flags))
>> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
>
> Place this with ff_lpc_apply_welch_window_sse2 below.
>
>> +
>> + if (EXTERNAL_AVX_FAST(cpu_flags))
>> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx;
>> if (EXTERNAL_SSE2(cpu_flags))
>> c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-25 22:31 ` James Almer
2024-05-25 22:45 ` James Almer
2024-05-25 23:24 ` Lynne via ffmpeg-devel
@ 2024-05-26 5:45 ` Rémi Denis-Courmont
2 siblings, 0 replies; 14+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-26 5:45 UTC (permalink / raw)
To: ffmpeg-devel
Le sunnuntaina 26. toukokuuta 2024, 1.31.18 EEST James Almer a écrit :
> On 5/25/2024 5:57 PM, Lynne via ffmpeg-devel wrote:
> > The inline asm function had issues running under checkasm.
> > So I came to finish what I started, and wrote the last part
> > of LPC computation in assembly.
> >
> > autocorr_10_c: 135525.8
> > autocorr_10_sse2: 50729.8
> > autocorr_10_fma3: 19007.8
> > autocorr_30_c: 390100.8
> > autocorr_30_sse2: 142478.8
> > autocorr_30_fma3: 50559.8
> > autocorr_32_c: 407058.3
> > autocorr_32_sse2: 151633.3
> > autocorr_32_fma3: 50517.3
> > ---
> >
> > libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
> > libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
> > 2 files changed, 100 insertions(+), 78 deletions(-)
> >
> > diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
> > index a585c17ef5..790841b7f4 100644
> > --- a/libavcodec/x86/lpc.asm
> > +++ b/libavcodec/x86/lpc.asm
> > @@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0
> >
> > dec_tab_scalar: times 2 dq -1.0
> > seq_tab_sse2: dq 1.0, 0.0
> >
> > +autoc_init_tab: times 4 dq 1.0
> > +
> >
> > SECTION .text
> >
> > %macro APPLY_WELCH_FN 0
> >
> > @@ -261,3 +263,92 @@ APPLY_WELCH_FN
> >
> > INIT_YMM avx2
> > APPLY_WELCH_FN
> > %endif
> >
> > +
> > +%macro COMPUTE_AUTOCORR_FN 0
> > +cglobal lpc_compute_autocorr, 4, 7, 8, data, len, lag, autoc, lag_p,
> > data_l, len_p
> Already mentioned, but it should be 3 not 8.
>
> > +
> > + shl lagd, 3
> > + shl lenq, 3
> > + xor lag_pq, lag_pq
> > +
> > +.lag_l:
> > + movaps m8, [autoc_init_tab]
>
> m2
>
> > +
> > + mov len_pq, lag_pq
> > +
> > + lea data_lq, [lag_pq + mmsize - 8]
> > + neg data_lq ; -j - mmsize
> > + add data_lq, dataq ; data[-j - mmsize]
> > +.len_l:
> > + ; We waste the upper value here on SSE2,
> > + ; but we use it on AVX.
> > + movupd xm0, [dataq + len_pq] ; data[i]
>
> movsd
>
> > + movupd m1, [data_lq + len_pq] ; data[i - j]
> > +
> > +%if cpuflag(avx)
>
> %if mmsize == 32 here and everywhere else.
>
> > + vbroadcastsd m0, xm0
>
> This is AVX2. AVX only has memory input argument. So use that and save
> the movsd from above for the FMA3 version.
>
> > + vperm2f128 m1, m1, m1, 0x01
>
> Aren't you loading 16 extra bytes for no reason if you're just going to
> use the upper 16 bytes from the load above?
>
> > +%endif
> > +
> > + shufpd m0, m0, m0, 1100b
>
> The last argument has two bits, not four. What you're doing here is a
> splat/broadcast, so you don't need it for FMA3.
>
> > + shufpd m1, m1, m1, 0101b
>
> The upper two bits of imm8 are ignored.
>
> > +
> > +%if cpuflag(fma3)
> > + fmaddpd m8, m0, m1, m8 ; sum += data[i]*data[i-j]
> > +%else
> > + mulpd m0, m1
> > + addpd m8, m0 ; sum += data[i]*data[i-j]
> > +%endif
> > +
> > + add len_pq, 8
> > + cmp len_pq, lenq
> > + jl .len_l
> > +
> > + movups [autocq + lag_pq], m8 ; autoc[j] = sum
> > + add lag_pq, mmsize
> > + cmp lag_pq, lagq
> > + jl .lag_l
> > +
> > + ; The tail computation is guaranteed never to happen
> > + ; as long as we're doing multiples of 4, rather than 2.
> > + ; It is trivial to convert this to avx if ever needed.
> > +%if !cpuflag(avx)
>
> This doesn't seem to be tested as is. Maybe the checkasm should try
> other lag values?
Uh, my patch tests 10, 30 and 32, so I am not clear what you think is missing
here.
--
レミ・デニ-クールモン
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2] lpc: rewrite lpc_compute_autocorr in external asm
2024-05-26 1:42 ` [FFmpeg-devel] [PATCH v2] " Lynne via ffmpeg-devel
2024-05-26 1:51 ` James Almer
@ 2024-05-26 19:43 ` Michael Niedermayer
1 sibling, 0 replies; 14+ messages in thread
From: Michael Niedermayer @ 2024-05-26 19:43 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1.1: Type: text/plain, Size: 1393 bytes --]
On Sun, May 26, 2024 at 03:42:01AM +0200, Lynne via ffmpeg-devel wrote:
> The inline asm function had issues running under checkasm.
> So I came to finish what I started, and wrote the last part
> of LPC computation in assembly.
> ---
> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
> 2 files changed, 100 insertions(+), 78 deletions(-)
seems to break fate
make: *** [tests/Makefile:311: fate-lavf-ogg] Error 1
make: *** [tests/Makefile:311: fate-iamf-stereo] Error 1
make: *** [tests/Makefile:311: fate-mov-mp4-iamf-stereo] Error 1
make: *** [tests/Makefile:311: fate-iamf-ambisonic_1] Error 1
make: *** [tests/Makefile:310: fate-mov-mp4-iamf-ambisonic_1] Error 1
make: *** [tests/Makefile:311: fate-mov-mp4-iamf-5_1_4] Error 1
make: *** [tests/Makefile:311: fate-iamf-5_1_4] Error 1
make: *** [tests/Makefile:311: fate-iamf-7_1_4] Error 1
make: *** [tests/Makefile:311: fate-mov-mp4-iamf-7_1_4] Error 1
make: *** [tests/Makefile:311: fate-cover-art-flac-remux] Error 1
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
If you fake or manipulate statistics in a paper in physics you will never
get a job again.
If you fake or manipulate statistics in a paper in medicin you will get
a job for life at the pharma industry.
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2024-05-26 19:43 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-25 20:57 [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm Lynne via ffmpeg-devel
2024-05-25 22:12 ` Michael Niedermayer
2024-05-25 22:31 ` James Almer
2024-05-25 22:45 ` James Almer
2024-05-26 0:02 ` Lynne via ffmpeg-devel
2024-05-26 0:09 ` James Almer
2024-05-25 23:24 ` Lynne via ffmpeg-devel
2024-05-25 23:41 ` James Almer
2024-05-26 5:45 ` Rémi Denis-Courmont
2024-05-26 0:39 ` James Almer
2024-05-26 1:42 ` [FFmpeg-devel] [PATCH v2] " Lynne via ffmpeg-devel
2024-05-26 1:51 ` James Almer
2024-05-26 2:16 ` James Almer
2024-05-26 19:43 ` Michael Niedermayer
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git