* [FFmpeg-devel] [PATCH] avutil/x86/float_dsp: add fma3 for scalarproduct
@ 2022-09-12 15:22 Paul B Mahol
0 siblings, 0 replies; only message in thread
From: Paul B Mahol @ 2022-09-12 15:22 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 16 bytes --]
Patch attached.
[-- Attachment #2: 0001-avutil-x86-float_dsp-add-fma3-for-scalarproduct.patch --]
[-- Type: text/x-patch, Size: 4782 bytes --]
From f7c47b8eefa1c06a74d17f13b4e9010785dc6430 Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Wed, 20 Jan 2021 16:58:31 +0100
Subject: [PATCH] avutil/x86/float_dsp: add fma3 for scalarproduct
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
libavutil/x86/float_dsp.asm | 127 +++++++++++++++++++++++++++++++++
libavutil/x86/float_dsp_init.c | 2 +
2 files changed, 129 insertions(+)
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index cca4d019c7..8f8e6dddf5 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -440,6 +440,133 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
%endif
RET
+INIT_YMM fma3  ; build the function below for FMA3, with m0..m7 used as 32-byte vector registers
+cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset  ; returns sum of v1[i]*v2[i]; v1/v2/size are the 3 arguments, len/offset are scratch GPRs, 8 vector regs
+ xor offsetq, offsetq  ; offset = 0: running byte offset applied to both v1 and v2
+ xorps m0, m0  ; clear accumulator 0 (the only accumulator every path uses)
+ shl sized, 2  ; size *= 4: element count -> byte count (4-byte floats)
+ mov lenq, sizeq  ; len = total bytes; size itself is kept for the leftover computations
+ cmp lenq, 32
+ jl .l16  ; < 32 bytes: 16-byte loop only. NOTE(review): also taken for < 16 bytes, where .loop16 still runs once - presumably callers pass a non-zero multiple of 4 elements; confirm
+ cmp lenq, 64
+ jl .l32  ; 32..63 bytes: start in the 32-byte loop
+ xorps m1, m1  ; clear accumulator 1 (needed by the 64- and 128-byte loops)
+ cmp lenq, 128
+ jl .l64  ; 64..127 bytes: start in the 64-byte loop
+ and lenq, ~127  ; len = end offset of the last whole 128-byte iteration
+ xorps m2, m2  ; clear accumulator 2
+ xorps m3, m3  ; clear accumulator 3
+.loop128:  ; 128 bytes (32 floats) per iteration into four independent accumulators
+ movups m4, [v1q+offsetq]  ; unaligned 32-byte loads from v1
+ movups m5, [v1q+offsetq + 32]
+ movups m6, [v1q+offsetq + 64]
+ movups m7, [v1q+offsetq + 96]
+ fmaddps m0, m4, [v2q+offsetq ], m0  ; m0 += m4 * v2[offset .. offset+31]
+ fmaddps m1, m5, [v2q+offsetq + 32], m1  ; m1 += m5 * next 32 bytes of v2
+ fmaddps m2, m6, [v2q+offsetq + 64], m2  ; m2 += m6 * next 32 bytes of v2
+ fmaddps m3, m7, [v2q+offsetq + 96], m3  ; m3 += m7 * next 32 bytes of v2
+ add offsetq, 128
+ cmp offsetq, lenq
+ jl .loop128  ; test is at the bottom, so the body runs at least once
+ addps m0, m2  ; fold four accumulators down to two (m0, m1)
+ addps m1, m3
+ mov lenq, sizeq
+ and lenq, 127  ; len = leftover bytes (size mod 128)
+ cmp lenq, 64
+ jge .l64  ; >= 64 bytes left: continue in the 64-byte loop with m0/m1 as running sums
+ addps m0, m1  ; otherwise fold into the single accumulator m0
+ cmp lenq, 32
+ jge .l32  ; >= 32 bytes left: continue in the 32-byte loop
+ vextractf128 xmm2, m0, 1  ; xmm2 = upper 128 bits of m0
+ addps xmm0, xmm2  ; xmm0 = low half + high half: four partial sums remain
+ cmp lenq, 16
+ jge .l16  ; >= 16 bytes left: continue in the 16-byte loop
+ movhlps xmm1, xmm0  ; horizontal sum: low two lanes of xmm1 = high two lanes of xmm0
+ addps xmm0, xmm1  ; lanes 0 and 1 now hold s0+s2 and s1+s3
+ movss xmm1, xmm0  ; save lane 0
+ shufps xmm0, xmm0, 1  ; move lane 1 into lane 0
+ addss xmm0, xmm1  ; lane 0 of xmm0 = final scalar product
+%if ARCH_X86_64 == 0
+ movss r0m, xm0  ; non-x86-64 builds only: store the result to the r0m memory operand...
+ fld dword r0m  ; ...and load it onto the x87 stack (presumably where the 32-bit ABI expects a float return - confirm)
+%endif
+ RET
+.l64:  ; entered with len = bytes still to process (>= 64), m0/m1 valid
+ and lenq, ~63  ; round the remaining byte count down to a multiple of 64
+ add lenq, offsetq  ; len = end offset for this loop
+.loop64:  ; 64 bytes (16 floats) per iteration into two accumulators
+ movups m4, [v1q+offsetq]
+ movups m5, [v1q+offsetq + 32]
+ fmaddps m0, m4, [v2q+offsetq], m0  ; m0 += m4 * v2[offset .. offset+31]
+ fmaddps m1, m5, [v2q+offsetq + 32], m1  ; m1 += m5 * next 32 bytes of v2
+ add offsetq, 64
+ cmp offsetq, lenq
+ jl .loop64
+ addps m0, m1  ; fold into the single accumulator m0
+ mov lenq, sizeq
+ and lenq, 63  ; len = leftover bytes (size mod 64)
+ cmp lenq, 32
+ jge .l32  ; >= 32 bytes left: continue in the 32-byte loop
+ vextractf128 xmm2, m0, 1  ; xmm2 = upper 128 bits of m0
+ addps xmm0, xmm2  ; reduce to four partial sums in xmm0
+ cmp lenq, 16
+ jge .l16  ; >= 16 bytes left: continue in the 16-byte loop
+ movhlps xmm1, xmm0  ; same horizontal sum as after .loop128
+ addps xmm0, xmm1
+ movss xmm1, xmm0
+ shufps xmm0, xmm0, 1
+ addss xmm0, xmm1  ; lane 0 of xmm0 = final scalar product
+%if ARCH_X86_64 == 0
+ movss r0m, xm0  ; non-x86-64 builds only: pass the result through memory to the x87 stack
+ fld dword r0m
+%endif
+ RET
+.l32:  ; entered with len = bytes still to process (>= 32), m0 valid
+ and lenq, ~31  ; round the remaining byte count down to a multiple of 32
+ add lenq, offsetq  ; len = end offset for this loop
+.loop32:  ; 32 bytes (8 floats) per iteration into m0
+ movups m4, [v1q+offsetq]
+ fmaddps m0, m4, [v2q+offsetq], m0  ; m0 += m4 * v2[offset .. offset+31]
+ add offsetq, 32
+ cmp offsetq, lenq
+ jl .loop32
+ vextractf128 xmm2, m0, 1  ; xmm2 = upper 128 bits of m0
+ addps xmm0, xmm2  ; reduce to four partial sums in xmm0
+ mov lenq, sizeq
+ and lenq, 31  ; len = leftover bytes (size mod 32)
+ cmp lenq, 16
+ jge .l16  ; >= 16 bytes left: continue in the 16-byte loop
+ movhlps xmm1, xmm0  ; same horizontal sum as after .loop128
+ addps xmm0, xmm1
+ movss xmm1, xmm0
+ shufps xmm0, xmm0, 1
+ addss xmm0, xmm1  ; lane 0 of xmm0 = final scalar product
+%if ARCH_X86_64 == 0
+ movss r0m, xm0  ; non-x86-64 builds only: pass the result through memory to the x87 stack
+ fld dword r0m
+%endif
+ RET
+.l16:  ; entered with len = bytes still to process, partial sums (or zero) in xmm0
+ and lenq, ~15  ; round the remaining byte count down to a multiple of 16
+ add lenq, offsetq  ; len = end offset for this loop
+.loop16:  ; 16 bytes (4 floats) per iteration, separate multiply and add
+ movaps xmm1, [v1q+offsetq]  ; aligned load. NOTE(review): requires v1+offset to be 16-byte aligned, unlike the movups loads above - confirm callers guarantee this
+ mulps xmm1, [v2q+offsetq]  ; xmm1 = v1 * v2 for these four floats
+ addps xmm0, xmm1  ; accumulate into xmm0
+ add offsetq, 16
+ cmp offsetq, lenq
+ jl .loop16  ; test is at the bottom, so the body runs at least once
+ movhlps xmm1, xmm0  ; same horizontal sum as after .loop128
+ addps xmm0, xmm1
+ movss xmm1, xmm0
+ shufps xmm0, xmm0, 1
+ addss xmm0, xmm1  ; lane 0 of xmm0 = final scalar product
+%if ARCH_X86_64 == 0
+ movss r0m, xm0  ; non-x86-64 builds only: pass the result through memory to the x87 stack
+ fld dword r0m
+%endif
+ RET
+
;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index ad17bc2044..ad6b506259 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -74,6 +74,7 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
const float *src1, int len);
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
+float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);
void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
@@ -112,5 +113,6 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
+ fdsp->scalarproduct_float = ff_scalarproduct_float_fma3;
}
}
--
2.37.2
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2022-09-12 15:23 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-12 15:22 [FFmpeg-devel] [PATCH] avutil/x86/float_dsp: add fma3 for scalarproduct Paul B Mahol
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git