From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTP id A33294B008 for ; Sun, 26 May 2024 01:42:23 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id D5D8C68D4DB; Sun, 26 May 2024 04:42:20 +0300 (EEST) Received: from vidala.lynne.ee (vidala.pars.ee [116.203.72.101]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 68E7968D491 for ; Sun, 26 May 2024 04:42:14 +0300 (EEST) To: ffmpeg-devel@ffmpeg.org Date: Sun, 26 May 2024 03:42:01 +0200 Message-ID: <20240526014207.2697057-1-dev@lynne.ee> X-Mailer: git-send-email 2.43.0.381.gb435a96ce8 In-Reply-To: <20240525205731.2578146-1-dev@lynne.ee> References: <20240525205731.2578146-1-dev@lynne.ee> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH v2] lpc: rewrite lpc_compute_autocorr in external asm X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , From: Lynne via ffmpeg-devel Reply-To: FFmpeg development discussions and patches Cc: Lynne Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: The inline asm function had issues running under checkasm. So I came to finish what I started, and wrote the last part of LPC computation in assembly. 
--- libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++ libavcodec/x86/lpc_init.c | 87 ++++--------------------------------- 2 files changed, 100 insertions(+), 78 deletions(-) diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm index a585c17ef5..9c359ae480 100644 --- a/libavcodec/x86/lpc.asm +++ b/libavcodec/x86/lpc.asm @@ -261,3 +261,94 @@ APPLY_WELCH_FN INIT_YMM avx2 APPLY_WELCH_FN %endif + +%macro COMPUTE_AUTOCORR_FN 0 +cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p, data_l, len_p + shl lagd, 3 + shl lenq, 3 + xor lag_pq, lag_pq + +.lag_l: + movaps m2, [one_tab] + + mov len_pq, lag_pq + + lea data_lq, [lag_pq + mmsize - 8] + neg data_lq ; -j - mmsize + add data_lq, dataq ; data[-j - mmsize] +.len_l: + +%if mmsize == 32 + vbroadcastsd m0, [dataq + len_pq] + vpermpd m1, [data_lq + len_pq], q0123 +%else + movupd m1, [data_lq + len_pq] ; data[i - j] + movsd xm0, [dataq + len_pq] ; data[i] + shufpd m1, m1, m1, 01b +%endif + + shufpd m0, m0, m0, 1100b + + ; fmadd actually hurts performance in this case due to + ; the earlier loads + shuffles + mulpd m0, m1 + addpd m2, m0 ; sum += data[i]*data[i-j] + + add len_pq, 8 + cmp len_pq, lenq + jl .len_l + + movupd [autocq + lag_pq], m2 ; autoc[j] = sum + add lag_pq, mmsize + cmp lag_pq, lagq + jl .lag_l + + ; The tail computation is guaranteed never to happen + ; as long as we're doing multiples of 4, rather than 2. 
+%if mmsize != 32 + jg .end + ; If lag_p == lag fallthrough + +.tail: + movaps m2, [one_tab] + + mov len_pq, lag_pq + sub len_pq, mmsize + + lea data_lq, [lag_pq] + neg data_lq ; -j + add data_lq, dataq ; data[-j] + +.tail_l: + movupd m0, [dataq + len_pq] + movupd m1, [data_lq + len_pq] + + mulpd m0, m1 + addpd m2, m0 ; sum += data[i]*data[i-j] + + add len_pq, mmsize + cmp len_pq, lenq + jl .tail_l + + shufpd m1, m2, m2, 01b + addpd m2, m1 + + ; Leave this here just in case it's ever needed +%if mmsize == 32 + vperm2f128 m1, m2, m2, 0x01 + addpd xm2, xm1 + movupd [autocq + lag_pq], xm2 +%else + movhpd [autocq + lag_pq], xm2 +%endif + +.end: +%endif + + RET +%endmacro + +INIT_XMM sse2 +COMPUTE_AUTOCORR_FN +INIT_YMM avx +COMPUTE_AUTOCORR_FN diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c index f2fca53799..bb174be53e 100644 --- a/libavcodec/x86/lpc_init.c +++ b/libavcodec/x86/lpc_init.c @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len, double *w_data); void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len, double *w_data); - -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 }; - -#if HAVE_SSE2_INLINE - -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag, - double *autoc) -{ - int j; - - if((x86_reg)data & 15) - data++; - - for(j=0; jlpc_compute_autocorr = lpc_compute_autocorr_sse2; -#endif + if (EXTERNAL_SSE2(cpu_flags)) + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2; + + if (EXTERNAL_AVX_FAST(cpu_flags)) + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx; if (EXTERNAL_SSE2(cpu_flags)) c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2; -- 2.43.0.381.gb435a96ce8 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".