From: James Almer <jamrial@gmail.com> To: ffmpeg-devel@ffmpeg.org Subject: [FFmpeg-devel] [PATCH 2/2] x86/aacpsdsp: add ps_hybrid_analysis_fma3 Date: Tue, 20 Sep 2022 16:32:45 -0300 Message-ID: <20220920193245.3390-2-jamrial@gmail.com> (raw) In-Reply-To: <20220920193245.3390-1-jamrial@gmail.com> This replaces the sse3 version, which was not faster than the sse one. Signed-off-by: James Almer <jamrial@gmail.com> --- libavcodec/x86/aacpsdsp.asm | 38 +++++++++++++++------------------- libavcodec/x86/aacpsdsp_init.c | 6 ++++-- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm index 543d33e68d..98f3e00f9a 100644 --- a/libavcodec/x86/aacpsdsp.asm +++ b/libavcodec/x86/aacpsdsp.asm @@ -403,10 +403,8 @@ HYBRID_SYNTHESIS_DEINT %macro PS_HYBRID_ANALYSIS_IN 1 movu m0, [inq+mmsize*%1] movu m1, [inq+mmsize*(5-%1)+8] - mova m3, m0 - mova m4, m1 - shufps m3, m3, q2301 - shufps m4, m4, q0123 + shufps m3, m0, m0, q2301 + shufps m4, m1, m1, q0123 shufps m1, m1, q1032 %if cpuflag(sse3) addsubps m3, m4 @@ -424,6 +422,15 @@ HYBRID_SYNTHESIS_DEINT %macro PS_HYBRID_ANALYSIS_LOOP 3 mova m2, [filterq+nq+mmsize*%3] shufps m2, m2, q2301 +%if cpuflag(fma3) +%if %3 + fmaddps m3, m2, [rsp+mmsize*%3*2], m3 + fmaddps m0, m2, [rsp+mmsize+mmsize*%3*2], m0 +%else + mulps m3, m2, [rsp] + mulps m0, m2, [rsp+mmsize] +%endif +%else ; cpuflag(sse) mova %2, [rsp+mmsize*%3*2] mova %1, [rsp+mmsize+mmsize*%3*2] mulps %2, m2 @@ -432,20 +439,21 @@ HYBRID_SYNTHESIS_DEINT addps m3, %2 addps m0, %1 %endif +%endif %endmacro %macro PS_HYBRID_ANALYSIS 0 -cglobal ps_hybrid_analysis, 5, 5, 8, 24 * 4, out, in, filter, stride, n +cglobal ps_hybrid_analysis, 5, 5, 5 + notcpuflag(fma3) * 3, 24 * 4, out, in, filter, stride, n %if cpuflag(sse3) %define MOVH movsd %else %define MOVH movlps + mova m7, [ps_p1m1p1m1] %endif shl strideq, 3 shl nd, 6 add filterq, nq neg nq - mova m7, [ps_p1m1p1m1] PS_HYBRID_ANALYSIS_IN 0 PS_HYBRID_ANALYSIS_IN 1 
PS_HYBRID_ANALYSIS_IN 2 @@ -456,26 +464,14 @@ align 16 PS_HYBRID_ANALYSIS_LOOP m5, m6, 1 PS_HYBRID_ANALYSIS_LOOP m5, m6, 2 -%if cpuflag(sse3) - pshufd m3, m3, q2301 - xorps m0, m7 - hsubps m3, m0 - pshufd m1, m3, q0020 - pshufd m3, m3, q0031 - addps m1, m3 - movsd m2, [inq+6*8] -%else - mova m1, m3 - mova m2, m0 - shufps m1, m1, q2301 - shufps m2, m2, q2301 + shufps m1, m3, m3, q2301 + shufps m2, m0, m0, q2301 subps m1, m3 addps m2, m0 unpcklps m3, m1, m2 unpckhps m1, m2 addps m1, m3 movu m2, [inq+6*8] ; faster than movlps and no risk of overread -%endif movss m3, [filterq+nq+8*6] SPLATD m3 mulps m2, m3 @@ -489,5 +485,5 @@ align 16 INIT_XMM sse PS_HYBRID_ANALYSIS -INIT_XMM sse3 +INIT_XMM fma3 PS_HYBRID_ANALYSIS diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c index 21f00efa24..0b0ee07db4 100644 --- a/libavcodec/x86/aacpsdsp_init.c +++ b/libavcodec/x86/aacpsdsp_init.c @@ -33,7 +33,7 @@ void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2], void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2], const float (*filter)[8][2], ptrdiff_t stride, int n); -void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2], +void ff_ps_hybrid_analysis_fma3(float (*out)[2], float (*in)[2], const float (*filter)[8][2], ptrdiff_t stride, int n); void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2], @@ -64,9 +64,11 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s) s->add_squares = ff_ps_add_squares_sse3; s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3; s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3; - s->hybrid_analysis = ff_ps_hybrid_analysis_sse3; } if (EXTERNAL_SSE4(cpu_flags)) { s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4; } + if (EXTERNAL_FMA3(cpu_flags)) { + s->hybrid_analysis = ff_ps_hybrid_analysis_fma3; + } } -- 2.37.3 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To 
unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
prev parent reply other threads:[~2022-09-20 19:34 UTC|newest] Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top 2022-09-20 19:32 [FFmpeg-devel] [PATCH 1/2] x86/aacpsdsp: precompute constant factors James Almer 2022-09-20 19:32 ` James Almer [this message]
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20220920193245.3390-2-jamrial@gmail.com \ --to=jamrial@gmail.com \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git