From: Lynne via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: Lynne <dev@lynne.ee> Subject: [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm Date: Sat, 25 May 2024 22:57:21 +0200 Message-ID: <20240525205731.2578146-1-dev@lynne.ee> (raw) The inline asm function had issues running under checkasm. So I came to finish what I started, and wrote the last part of LPC computation in assembly. autocorr_10_c: 135525.8 autocorr_10_sse2: 50729.8 autocorr_10_fma3: 19007.8 autocorr_30_c: 390100.8 autocorr_30_sse2: 142478.8 autocorr_30_fma3: 50559.8 autocorr_32_c: 407058.3 autocorr_32_sse2: 151633.3 autocorr_32_fma3: 50517.3 --- libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++ libavcodec/x86/lpc_init.c | 87 ++++--------------------------------- 2 files changed, 100 insertions(+), 78 deletions(-) diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm index a585c17ef5..790841b7f4 100644 --- a/libavcodec/x86/lpc.asm +++ b/libavcodec/x86/lpc.asm @@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0 dec_tab_scalar: times 2 dq -1.0 seq_tab_sse2: dq 1.0, 0.0 +autoc_init_tab: times 4 dq 1.0 + SECTION .text %macro APPLY_WELCH_FN 0 @@ -261,3 +263,92 @@ APPLY_WELCH_FN INIT_YMM avx2 APPLY_WELCH_FN %endif + +%macro COMPUTE_AUTOCORR_FN 0 +cglobal lpc_compute_autocorr, 4, 7, 8, data, len, lag, autoc, lag_p, data_l, len_p + + shl lagd, 3 + shl lenq, 3 + xor lag_pq, lag_pq + +.lag_l: + movaps m8, [autoc_init_tab] + + mov len_pq, lag_pq + + lea data_lq, [lag_pq + mmsize - 8] + neg data_lq ; -j - mmsize + add data_lq, dataq ; data[-j - mmsize] +.len_l: + ; We waste the upper value here on SSE2, + ; but we use it on AVX. 
+ movupd xm0, [dataq + len_pq] ; data[i] + movupd m1, [data_lq + len_pq] ; data[i - j] + +%if cpuflag(avx) + vbroadcastsd m0, xm0 + vperm2f128 m1, m1, m1, 0x01 +%endif + + shufpd m0, m0, m0, 1100b + shufpd m1, m1, m1, 0101b + +%if cpuflag(fma3) + fmaddpd m8, m0, m1, m8 ; sum += data[i]*data[i-j] +%else + mulpd m0, m1 + addpd m8, m0 ; sum += data[i]*data[i-j] +%endif + + add len_pq, 8 + cmp len_pq, lenq + jl .len_l + + movups [autocq + lag_pq], m8 ; autoc[j] = sum + add lag_pq, mmsize + cmp lag_pq, lagq + jl .lag_l + + ; The tail computation is guaranteed never to happen + ; as long as we're doing multiples of 4, rather than 2. + ; It is trivial to convert this to avx if ever needed. +%if !cpuflag(avx) + jg .end + ; If lag_p == lag fallthrough + +.tail: + movaps xm2, [autoc_init_tab] + + mov len_pq, lag_pq + sub len_pq, mmsize + + lea data_lq, [lag_pq] + neg data_lq ; -j + add data_lq, dataq ; data[-j] + +.tail_l: + movupd xm0, [dataq + len_pq] + movupd xm1, [data_lq + len_pq] + + mulpd xm0, xm1 + addpd xm2, xm0 ; sum += data[i]*data[i-j] + + add len_pq, mmsize + cmp len_pq, lenq + jl .tail_l + + shufpd xm1, xm2, xm2, 01b + addpd xm2, xm1 + + movhpd [autocq + lag_pq], xm2 +%endif + +.end: + RET + +%endmacro + +INIT_XMM sse2 +COMPUTE_AUTOCORR_FN +INIT_YMM fma3 +COMPUTE_AUTOCORR_FN diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c index f2fca53799..96469fae40 100644 --- a/libavcodec/x86/lpc_init.c +++ b/libavcodec/x86/lpc_init.c @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len, double *w_data); void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len, double *w_data); - -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 }; - -#if HAVE_SSE2_INLINE - -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag, - double *autoc) -{ - int j; - - if((x86_reg)data & 15) - data++; - - for(j=0; j<lag; j+=2){ - x86_reg i = -len*sizeof(double); - if(j == lag-2) { - __asm__ volatile( 
- "movsd "MANGLE(pd_1)", %%xmm0 \n\t" - "movsd "MANGLE(pd_1)", %%xmm1 \n\t" - "movsd "MANGLE(pd_1)", %%xmm2 \n\t" - "1: \n\t" - "movapd (%2,%0), %%xmm3 \n\t" - "movupd -8(%3,%0), %%xmm4 \n\t" - "movapd (%3,%0), %%xmm5 \n\t" - "mulpd %%xmm3, %%xmm4 \n\t" - "mulpd %%xmm3, %%xmm5 \n\t" - "mulpd -16(%3,%0), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm1 \n\t" - "addpd %%xmm5, %%xmm0 \n\t" - "addpd %%xmm3, %%xmm2 \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - "movhlps %%xmm0, %%xmm3 \n\t" - "movhlps %%xmm1, %%xmm4 \n\t" - "movhlps %%xmm2, %%xmm5 \n\t" - "addsd %%xmm3, %%xmm0 \n\t" - "addsd %%xmm4, %%xmm1 \n\t" - "addsd %%xmm5, %%xmm2 \n\t" - "movsd %%xmm0, (%1) \n\t" - "movsd %%xmm1, 8(%1) \n\t" - "movsd %%xmm2, 16(%1) \n\t" - :"+&r"(i) - :"r"(autoc+j), "r"(data+len), "r"(data+len-j) - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1) - :"memory" - ); - } else { - __asm__ volatile( - "movsd "MANGLE(pd_1)", %%xmm0 \n\t" - "movsd "MANGLE(pd_1)", %%xmm1 \n\t" - "1: \n\t" - "movapd (%3,%0), %%xmm3 \n\t" - "movupd -8(%4,%0), %%xmm4 \n\t" - "mulpd %%xmm3, %%xmm4 \n\t" - "mulpd (%4,%0), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm1 \n\t" - "addpd %%xmm3, %%xmm0 \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - "movhlps %%xmm0, %%xmm3 \n\t" - "movhlps %%xmm1, %%xmm4 \n\t" - "addsd %%xmm3, %%xmm0 \n\t" - "addsd %%xmm4, %%xmm1 \n\t" - "movsd %%xmm0, %1 \n\t" - "movsd %%xmm1, %2 \n\t" - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]) - :"r"(data+len), "r"(data+len-j) - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1) - ); - } - } -} - -#endif /* HAVE_SSE2_INLINE */ +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag, + double *autoc); +void ff_lpc_compute_autocorr_fma3(const double *data, ptrdiff_t len, int lag, + double *autoc); av_cold void ff_lpc_init_x86(LPCContext *c) { int cpu_flags = av_get_cpu_flags(); -#if HAVE_SSE2_INLINE - if (INLINE_SSE2_SLOW(cpu_flags)) - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; -#endif + if (EXTERNAL_SSE2(cpu_flags)) + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2; 
+ + if (EXTERNAL_FMA3(cpu_flags)) + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_fma3; if (EXTERNAL_SSE2(cpu_flags)) c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2; -- 2.43.0.381.gb435a96ce8 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next reply other threads:[~2024-05-25 20:57 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-05-25 20:57 Lynne via ffmpeg-devel [this message]
2024-05-25 22:12 ` Michael Niedermayer
2024-05-25 22:31 ` James Almer
2024-05-25 22:45 ` James Almer
2024-05-26 0:02 ` Lynne via ffmpeg-devel
2024-05-26 0:09 ` James Almer
2024-05-25 23:24 ` Lynne via ffmpeg-devel
2024-05-25 23:41 ` James Almer
2024-05-26 5:45 ` Rémi Denis-Courmont
2024-05-26 0:39 ` James Almer
2024-05-26 1:42 ` [FFmpeg-devel] [PATCH v2] " Lynne via ffmpeg-devel
2024-05-26 1:51 ` James Almer
2024-05-26 2:16 ` James Almer
2024-05-26 19:43 ` Michael Niedermayer
Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240525205731.2578146-1-dev@lynne.ee \
    --to=ffmpeg-devel@ffmpeg.org \
    --cc=dev@lynne.ee \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git