Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] x86/intreadwrite: use intrinsics instead of inline asm for AV_ZERO128
@ 2022-10-24 20:03 James Almer
  2022-11-13 22:32 ` James Almer
  0 siblings, 1 reply; 2+ messages in thread
From: James Almer @ 2022-10-24 20:03 UTC (permalink / raw)
  To: ffmpeg-devel

When called inside a loop, the inline asm version results in one pxor
unnecessarily emitted per iteration, as the contents of the __asm__() block are
opaque to the compiler's instruction scheduler.
This is not the case with intrinsics, where pxor will be emitted once with any
half-decent compiler.

The code can be adapted to also work with MSVC, but for now, it will work with
the same compilers previously supported (GCC, Clang, etc).

Signed-off-by: James Almer <jamrial@gmail.com>
---
 configure                    |  3 +++
 libavutil/x86/intreadwrite.h | 15 +++++++--------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/configure b/configure
index c5a466657f..5bb83f5b5a 100755
--- a/configure
+++ b/configure
@@ -2222,6 +2222,7 @@ HEADERS_LIST="
 
 INTRINSICS_LIST="
     intrinsics_neon
+    intrinsics_sse2
 "
 
 COMPLEX_FUNCS="
@@ -2636,6 +2637,7 @@ armv6t2_deps="arm"
 armv8_deps="aarch64"
 neon_deps_any="aarch64 arm"
 intrinsics_neon_deps="neon"
+intrinsics_sse2_deps="sse2"
 vfp_deps_any="aarch64 arm"
 vfpv3_deps="vfp"
 setend_deps="arm"
@@ -6207,6 +6209,7 @@ elif enabled loongarch; then
 fi
 
 check_cc intrinsics_neon arm_neon.h "int16x8_t test = vdupq_n_s16(0)"
+check_cc intrinsics_sse2 emmintrin.h "__m128i test = _mm_setzero_si128()"
 
 check_ldflags -Wl,--as-needed
 check_ldflags -Wl,-z,noexecstack
diff --git a/libavutil/x86/intreadwrite.h b/libavutil/x86/intreadwrite.h
index 40f375b013..4a03e60fc6 100644
--- a/libavutil/x86/intreadwrite.h
+++ b/libavutil/x86/intreadwrite.h
@@ -21,6 +21,9 @@
 #ifndef AVUTIL_X86_INTREADWRITE_H
 #define AVUTIL_X86_INTREADWRITE_H
 
+#if HAVE_INTRINSICS_SSE2
+#include <emmintrin.h>
+#endif
 #include <stdint.h>
 #include "config.h"
 #include "libavutil/attributes.h"
@@ -79,20 +82,16 @@ static av_always_inline void AV_COPY128(void *d, const void *s)
 
 #endif /* __SSE__ */
 
-#ifdef __SSE2__
+#if HAVE_INTRINSICS_SSE2 && defined(__SSE2__)
 
 #define AV_ZERO128 AV_ZERO128
 static av_always_inline void AV_ZERO128(void *d)
 {
-    struct v {uint64_t v[2];};
-
-    __asm__("pxor %%xmm0, %%xmm0  \n\t"
-            "movdqa   %%xmm0, %0  \n\t"
-            : "=m"(*(struct v*)d)
-            :: "xmm0");
+    __m128i zero = _mm_setzero_si128();
+    _mm_store_si128(d, zero);
 }
 
-#endif /* __SSE2__ */
+#endif /* HAVE_INTRINSICS_SSE2 && defined(__SSE2__) */
 
 #endif /* HAVE_MMX */
 
-- 
2.38.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [FFmpeg-devel] [PATCH] x86/intreadwrite: use intrinsics instead of inline asm for AV_ZERO128
  2022-10-24 20:03 [FFmpeg-devel] [PATCH] x86/intreadwrite: use intrinsics instead of inline asm for AV_ZERO128 James Almer
@ 2022-11-13 22:32 ` James Almer
  0 siblings, 0 replies; 2+ messages in thread
From: James Almer @ 2022-11-13 22:32 UTC (permalink / raw)
  To: ffmpeg-devel

On 10/24/2022 5:03 PM, James Almer wrote:
> When called inside a loop, the inline asm version results in one pxor
> unnecessarily emitted per iteration, as the contents of the __asm__() block are
> opaque to the compiler's instruction scheduler.
> This is not the case with intrinsics, where pxor will be emitted once with any
> half decent compiler.
> 
> The code can be adapted to also work with MSVC, but for now, it will work with
> the same compilers previously supported (GCC, Clang, etc).
> 
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>   configure                    |  3 +++
>   libavutil/x86/intreadwrite.h | 15 +++++++--------
>   2 files changed, 10 insertions(+), 8 deletions(-)
> 
> diff --git a/configure b/configure
> index c5a466657f..5bb83f5b5a 100755
> --- a/configure
> +++ b/configure
> @@ -2222,6 +2222,7 @@ HEADERS_LIST="
>   
>   INTRINSICS_LIST="
>       intrinsics_neon
> +    intrinsics_sse2
>   "
>   
>   COMPLEX_FUNCS="
> @@ -2636,6 +2637,7 @@ armv6t2_deps="arm"
>   armv8_deps="aarch64"
>   neon_deps_any="aarch64 arm"
>   intrinsics_neon_deps="neon"
> +intrinsics_sse2_deps="sse2"
>   vfp_deps_any="aarch64 arm"
>   vfpv3_deps="vfp"
>   setend_deps="arm"
> @@ -6207,6 +6209,7 @@ elif enabled loongarch; then
>   fi
>   
>   check_cc intrinsics_neon arm_neon.h "int16x8_t test = vdupq_n_s16(0)"
> +check_cc intrinsics_sse2 emmintrin.h "__m128i test = _mm_setzero_si128()"
>   
>   check_ldflags -Wl,--as-needed
>   check_ldflags -Wl,-z,noexecstack
> diff --git a/libavutil/x86/intreadwrite.h b/libavutil/x86/intreadwrite.h
> index 40f375b013..4a03e60fc6 100644
> --- a/libavutil/x86/intreadwrite.h
> +++ b/libavutil/x86/intreadwrite.h
> @@ -21,6 +21,9 @@
>   #ifndef AVUTIL_X86_INTREADWRITE_H
>   #define AVUTIL_X86_INTREADWRITE_H
>   
> +#if HAVE_INTRINSICS_SSE2
> +#include <emmintrin.h>
> +#endif
>   #include <stdint.h>
>   #include "config.h"
>   #include "libavutil/attributes.h"
> @@ -79,20 +82,16 @@ static av_always_inline void AV_COPY128(void *d, const void *s)
>   
>   #endif /* __SSE__ */
>   
> -#ifdef __SSE2__
> +#if HAVE_INTRINSICS_SSE2 && defined(__SSE2__)
>   
>   #define AV_ZERO128 AV_ZERO128
>   static av_always_inline void AV_ZERO128(void *d)
>   {
> -    struct v {uint64_t v[2];};
> -
> -    __asm__("pxor %%xmm0, %%xmm0  \n\t"
> -            "movdqa   %%xmm0, %0  \n\t"
> -            : "=m"(*(struct v*)d)
> -            :: "xmm0");
> +    __m128i zero = _mm_setzero_si128();
> +    _mm_store_si128(d, zero);
>   }
>   
> -#endif /* __SSE2__ */
> +#endif /* HAVE_INTRINSICS_SSE2 && defined(__SSE2__) */
>   
>   #endif /* HAVE_MMX */

Will apply.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2022-11-13 22:32 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-24 20:03 [FFmpeg-devel] [PATCH] x86/intreadwrite: use intrinsics instead of inline asm for AV_ZERO128 James Almer
2022-11-13 22:32 ` James Almer

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git