Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
@ 2021-12-20 13:56 Alan Kelly
  2021-12-20 14:21 ` James Almer
  0 siblings, 1 reply; 10+ messages in thread
From: Alan Kelly @ 2021-12-20 13:56 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Alan Kelly

This flag is set on Haswell and earlier and all AMD cpus.
---
 As discussed on IRC last week.
 libavutil/cpu.h     | 57 +++++++++++++++++++++++----------------------
 libavutil/x86/cpu.c | 13 ++++++++++-
 2 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index ae443eccad..4272d11d73 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -26,34 +26,35 @@
 #define AV_CPU_FLAG_FORCE    0x80000000 /* force usage of selected flags (OR) */
 
     /* lower 16 bits - CPU features */
-#define AV_CPU_FLAG_MMX          0x0001 ///< standard MMX
-#define AV_CPU_FLAG_MMXEXT       0x0002 ///< SSE integer functions or AMD MMX ext
-#define AV_CPU_FLAG_MMX2         0x0002 ///< SSE integer functions or AMD MMX ext
-#define AV_CPU_FLAG_3DNOW        0x0004 ///< AMD 3DNOW
-#define AV_CPU_FLAG_SSE          0x0008 ///< SSE functions
-#define AV_CPU_FLAG_SSE2         0x0010 ///< PIV SSE2 functions
-#define AV_CPU_FLAG_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster
-                                        ///< than regular MMX/SSE (e.g. Core1)
-#define AV_CPU_FLAG_3DNOWEXT     0x0020 ///< AMD 3DNowExt
-#define AV_CPU_FLAG_SSE3         0x0040 ///< Prescott SSE3 functions
-#define AV_CPU_FLAG_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster
-                                        ///< than regular MMX/SSE (e.g. Core1)
-#define AV_CPU_FLAG_SSSE3        0x0080 ///< Conroe SSSE3 functions
-#define AV_CPU_FLAG_SSSE3SLOW 0x4000000 ///< SSSE3 supported, but usually not faster
-#define AV_CPU_FLAG_ATOM     0x10000000 ///< Atom processor, some SSSE3 instructions are slower
-#define AV_CPU_FLAG_SSE4         0x0100 ///< Penryn SSE4.1 functions
-#define AV_CPU_FLAG_SSE42        0x0200 ///< Nehalem SSE4.2 functions
-#define AV_CPU_FLAG_AESNI       0x80000 ///< Advanced Encryption Standard functions
-#define AV_CPU_FLAG_AVX          0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used
-#define AV_CPU_FLAG_AVXSLOW   0x8000000 ///< AVX supported, but slow when using YMM registers (e.g. Bulldozer)
-#define AV_CPU_FLAG_XOP          0x0400 ///< Bulldozer XOP functions
-#define AV_CPU_FLAG_FMA4         0x0800 ///< Bulldozer FMA4 functions
-#define AV_CPU_FLAG_CMOV         0x1000 ///< supports cmov instruction
-#define AV_CPU_FLAG_AVX2         0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used
-#define AV_CPU_FLAG_FMA3        0x10000 ///< Haswell FMA3 functions
-#define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
-#define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
-#define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
+#define AV_CPU_FLAG_MMX             0x0001 ///< standard MMX
+#define AV_CPU_FLAG_MMXEXT          0x0002 ///< SSE integer functions or AMD MMX ext
+#define AV_CPU_FLAG_MMX2            0x0002 ///< SSE integer functions or AMD MMX ext
+#define AV_CPU_FLAG_3DNOW           0x0004 ///< AMD 3DNOW
+#define AV_CPU_FLAG_SSE             0x0008 ///< SSE functions
+#define AV_CPU_FLAG_SSE2            0x0010 ///< PIV SSE2 functions
+#define AV_CPU_FLAG_SSE2SLOW    0x40000000 ///< SSE2 supported, but usually not faster
+                                           ///< than regular MMX/SSE (e.g. Core1)
+#define AV_CPU_FLAG_3DNOWEXT        0x0020 ///< AMD 3DNowExt
+#define AV_CPU_FLAG_SSE3            0x0040 ///< Prescott SSE3 functions
+#define AV_CPU_FLAG_SSE3SLOW    0x20000000 ///< SSE3 supported, but usually not faster
+                                           ///< than regular MMX/SSE (e.g. Core1)
+#define AV_CPU_FLAG_SSSE3           0x0080 ///< Conroe SSSE3 functions
+#define AV_CPU_FLAG_SSSE3SLOW    0x4000000 ///< SSSE3 supported, but usually not faster
+#define AV_CPU_FLAG_ATOM        0x10000000 ///< Atom processor, some SSSE3 instructions are slower
+#define AV_CPU_FLAG_SSE4            0x0100 ///< Penryn SSE4.1 functions
+#define AV_CPU_FLAG_SSE42           0x0200 ///< Nehalem SSE4.2 functions
+#define AV_CPU_FLAG_AESNI          0x80000 ///< Advanced Encryption Standard functions
+#define AV_CPU_FLAG_AVX             0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used
+#define AV_CPU_FLAG_AVXSLOW      0x8000000 ///< AVX supported, but slow when using YMM registers (e.g. Bulldozer)
+#define AV_CPU_FLAG_XOP             0x0400 ///< Bulldozer XOP functions
+#define AV_CPU_FLAG_FMA4            0x0800 ///< Bulldozer FMA4 functions
+#define AV_CPU_FLAG_CMOV            0x1000 ///< supports cmov instruction
+#define AV_CPU_FLAG_AVX2            0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used
+#define AV_CPU_FLAG_FMA3           0x10000 ///< Haswell FMA3 functions
+#define AV_CPU_FLAG_BMI1           0x20000 ///< Bit Manipulation Instruction Set 1
+#define AV_CPU_FLAG_BMI2           0x40000 ///< Bit Manipulation Instruction Set 2
+#define AV_CPU_FLAG_AVX512        0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
+#define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.
 
 #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
 #define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..5770ecec72 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void)
     if (max_std_level >= 7) {
         cpuid(7, eax, ebx, ecx, edx);
 #if HAVE_AVX2
-        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
+        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) {
             rval |= AV_CPU_FLAG_AVX2;
+            cpuid(1, eax, ebx, ecx, std_caps);
+            family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+            model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+            /* Haswell and earlier has slow gather */
+            if(family == 6 && model < 70)
+                rval |= AV_CPU_FLAG_SLOW_GATHER;
+        }
+
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
         if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
             if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
@@ -196,6 +204,9 @@ int ff_get_cpu_flags_x86(void)
            used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
             if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
                 rval |= AV_CPU_FLAG_AVXSLOW;
+
+        /* AMD cpus have slow gather */
+            rval |= AV_CPU_FLAG_SLOW_GATHER;
         }
 
         /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
-- 
2.34.1.173.g76aa8bc2d0-goog

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
  2021-12-20 13:56 [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER Alan Kelly
@ 2021-12-20 14:21 ` James Almer
  2021-12-20 14:43   ` Alan Kelly
  0 siblings, 1 reply; 10+ messages in thread
From: James Almer @ 2021-12-20 14:21 UTC (permalink / raw)
  To: ffmpeg-devel



On 12/20/2021 10:56 AM, Alan Kelly wrote:
> This flag is set on Haswell and earlier and all AMD cpus.
> ---
>   As discussed on IRC last week.
>   libavutil/cpu.h     | 57 +++++++++++++++++++++++----------------------
>   libavutil/x86/cpu.c | 13 ++++++++++-
>   2 files changed, 41 insertions(+), 29 deletions(-)
> 
> diff --git a/libavutil/cpu.h b/libavutil/cpu.h
> index ae443eccad..4272d11d73 100644
> --- a/libavutil/cpu.h
> +++ b/libavutil/cpu.h
> @@ -26,34 +26,35 @@
>   #define AV_CPU_FLAG_FORCE    0x80000000 /* force usage of selected flags (OR) */
>   
>       /* lower 16 bits - CPU features */
> -#define AV_CPU_FLAG_MMX          0x0001 ///< standard MMX
> -#define AV_CPU_FLAG_MMXEXT       0x0002 ///< SSE integer functions or AMD MMX ext
> -#define AV_CPU_FLAG_MMX2         0x0002 ///< SSE integer functions or AMD MMX ext
> -#define AV_CPU_FLAG_3DNOW        0x0004 ///< AMD 3DNOW
> -#define AV_CPU_FLAG_SSE          0x0008 ///< SSE functions
> -#define AV_CPU_FLAG_SSE2         0x0010 ///< PIV SSE2 functions
> -#define AV_CPU_FLAG_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster
> -                                        ///< than regular MMX/SSE (e.g. Core1)
> -#define AV_CPU_FLAG_3DNOWEXT     0x0020 ///< AMD 3DNowExt
> -#define AV_CPU_FLAG_SSE3         0x0040 ///< Prescott SSE3 functions
> -#define AV_CPU_FLAG_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster
> -                                        ///< than regular MMX/SSE (e.g. Core1)
> -#define AV_CPU_FLAG_SSSE3        0x0080 ///< Conroe SSSE3 functions
> -#define AV_CPU_FLAG_SSSE3SLOW 0x4000000 ///< SSSE3 supported, but usually not faster
> -#define AV_CPU_FLAG_ATOM     0x10000000 ///< Atom processor, some SSSE3 instructions are slower
> -#define AV_CPU_FLAG_SSE4         0x0100 ///< Penryn SSE4.1 functions
> -#define AV_CPU_FLAG_SSE42        0x0200 ///< Nehalem SSE4.2 functions
> -#define AV_CPU_FLAG_AESNI       0x80000 ///< Advanced Encryption Standard functions
> -#define AV_CPU_FLAG_AVX          0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used
> -#define AV_CPU_FLAG_AVXSLOW   0x8000000 ///< AVX supported, but slow when using YMM registers (e.g. Bulldozer)
> -#define AV_CPU_FLAG_XOP          0x0400 ///< Bulldozer XOP functions
> -#define AV_CPU_FLAG_FMA4         0x0800 ///< Bulldozer FMA4 functions
> -#define AV_CPU_FLAG_CMOV         0x1000 ///< supports cmov instruction
> -#define AV_CPU_FLAG_AVX2         0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used
> -#define AV_CPU_FLAG_FMA3        0x10000 ///< Haswell FMA3 functions
> -#define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
> -#define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
> -#define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
> +#define AV_CPU_FLAG_MMX             0x0001 ///< standard MMX
> +#define AV_CPU_FLAG_MMXEXT          0x0002 ///< SSE integer functions or AMD MMX ext
> +#define AV_CPU_FLAG_MMX2            0x0002 ///< SSE integer functions or AMD MMX ext
> +#define AV_CPU_FLAG_3DNOW           0x0004 ///< AMD 3DNOW
> +#define AV_CPU_FLAG_SSE             0x0008 ///< SSE functions
> +#define AV_CPU_FLAG_SSE2            0x0010 ///< PIV SSE2 functions
> +#define AV_CPU_FLAG_SSE2SLOW    0x40000000 ///< SSE2 supported, but usually not faster
> +                                           ///< than regular MMX/SSE (e.g. Core1)
> +#define AV_CPU_FLAG_3DNOWEXT        0x0020 ///< AMD 3DNowExt
> +#define AV_CPU_FLAG_SSE3            0x0040 ///< Prescott SSE3 functions
> +#define AV_CPU_FLAG_SSE3SLOW    0x20000000 ///< SSE3 supported, but usually not faster
> +                                           ///< than regular MMX/SSE (e.g. Core1)
> +#define AV_CPU_FLAG_SSSE3           0x0080 ///< Conroe SSSE3 functions
> +#define AV_CPU_FLAG_SSSE3SLOW    0x4000000 ///< SSSE3 supported, but usually not faster
> +#define AV_CPU_FLAG_ATOM        0x10000000 ///< Atom processor, some SSSE3 instructions are slower
> +#define AV_CPU_FLAG_SSE4            0x0100 ///< Penryn SSE4.1 functions
> +#define AV_CPU_FLAG_SSE42           0x0200 ///< Nehalem SSE4.2 functions
> +#define AV_CPU_FLAG_AESNI          0x80000 ///< Advanced Encryption Standard functions
> +#define AV_CPU_FLAG_AVX             0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used
> +#define AV_CPU_FLAG_AVXSLOW      0x8000000 ///< AVX supported, but slow when using YMM registers (e.g. Bulldozer)
> +#define AV_CPU_FLAG_XOP             0x0400 ///< Bulldozer XOP functions
> +#define AV_CPU_FLAG_FMA4            0x0800 ///< Bulldozer FMA4 functions
> +#define AV_CPU_FLAG_CMOV            0x1000 ///< supports cmov instruction
> +#define AV_CPU_FLAG_AVX2            0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used
> +#define AV_CPU_FLAG_FMA3           0x10000 ///< Haswell FMA3 functions
> +#define AV_CPU_FLAG_BMI1           0x20000 ///< Bit Manipulation Instruction Set 1
> +#define AV_CPU_FLAG_BMI2           0x40000 ///< Bit Manipulation Instruction Set 2
> +#define AV_CPU_FLAG_AVX512        0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
> +#define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.

Don't re-indent the other flags. It will affect git blame output.

>   
>   #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
>   #define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
> index bcd41a50a2..5770ecec72 100644
> --- a/libavutil/x86/cpu.c
> +++ b/libavutil/x86/cpu.c
> @@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void)
>       if (max_std_level >= 7) {
>           cpuid(7, eax, ebx, ecx, edx);
>   #if HAVE_AVX2
> -        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
> +        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) {
>               rval |= AV_CPU_FLAG_AVX2;
> +            cpuid(1, eax, ebx, ecx, std_caps);
> +            family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
> +            model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
> +            /* Haswell and earlier has slow gather */

Afaik Haswell is the first CPU with gathers, so this comment is odd. Is 
there another CPU this check below will trigger for?

> +            if(family == 6 && model < 70)
> +                rval |= AV_CPU_FLAG_SLOW_GATHER;
> +        }
> +
>   #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
>           if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
>               if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
> @@ -196,6 +204,9 @@ int ff_get_cpu_flags_x86(void)
>              used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
>               if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
>                   rval |= AV_CPU_FLAG_AVXSLOW;
> +
> +        /* AMD cpus have slow gather */
> +            rval |= AV_CPU_FLAG_SLOW_GATHER;

Don't unconditionally enable this for every CPU. Do it only for those 
with AVX2.

if (rval & AV_CPU_FLAG_AVX2)
     rval |= AV_CPU_FLAG_SLOW_GATHER;

>           }
>   
>           /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
  2021-12-20 14:21 ` James Almer
@ 2021-12-20 14:43   ` Alan Kelly
  2021-12-20 14:47     ` Lynne
  0 siblings, 1 reply; 10+ messages in thread
From: Alan Kelly @ 2021-12-20 14:43 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Alan Kelly

This flag is set on Haswell and on all AMD CPUs that support AVX2.
---
 Removes unnecessary indentation, clarifies comment and only sets flag on AMD
 cpus with AVX2.
 libavutil/cpu.h     |  1 +
 libavutil/x86/cpu.c | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index ae443eccad..ce9bf14bf7 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -54,6 +54,7 @@
 #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
 #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
 #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
+#define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.
 
 #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
 #define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..563984f234 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void)
     if (max_std_level >= 7) {
         cpuid(7, eax, ebx, ecx, edx);
 #if HAVE_AVX2
-        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
+        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) {
             rval |= AV_CPU_FLAG_AVX2;
+            cpuid(1, eax, ebx, ecx, std_caps);
+            family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+            model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+            /* Haswell has slow gather */
+            if(family == 6 && model < 70)
+                rval |= AV_CPU_FLAG_SLOW_GATHER;
+        }
+
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
         if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
             if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
@@ -196,6 +204,10 @@ int ff_get_cpu_flags_x86(void)
            used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
             if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
                 rval |= AV_CPU_FLAG_AVXSLOW;
+
+        /* AMD cpus have slow gather */
+            if(rval & AV_CPU_FLAG_AVX2)
+                rval |= AV_CPU_FLAG_SLOW_GATHER;
         }
 
         /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
-- 
2.34.1.173.g76aa8bc2d0-goog

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
  2021-12-20 14:43   ` Alan Kelly
@ 2021-12-20 14:47     ` Lynne
  2021-12-20 14:53       ` James Almer
  0 siblings, 1 reply; 10+ messages in thread
From: Lynne @ 2021-12-20 14:47 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

20 Dec 2021, 15:43 by alankelly-at-google.com@ffmpeg.org:

> This flag is set on Haswell and earlier and all AMD cpus.
> ---
>  Removes unnecessary indentation, clarifies comment and only sets flag on AMD
>  cpus with AVX2.
>  libavutil/cpu.h     |  1 +
>  libavutil/x86/cpu.c | 14 +++++++++++++-
>  2 files changed, 14 insertions(+), 1 deletion(-)
>
> diff --git a/libavutil/cpu.h b/libavutil/cpu.h
> index ae443eccad..ce9bf14bf7 100644
> --- a/libavutil/cpu.h
> +++ b/libavutil/cpu.h
> @@ -54,6 +54,7 @@
>  #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
>  #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
>  #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
> +#define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.
>  
>  #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
>  #define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
> index bcd41a50a2..563984f234 100644
> --- a/libavutil/x86/cpu.c
> +++ b/libavutil/x86/cpu.c
> @@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void)
>  if (max_std_level >= 7) {
>  cpuid(7, eax, ebx, ecx, edx);
>  #if HAVE_AVX2
> -        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
> +        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) {
>  rval |= AV_CPU_FLAG_AVX2;
> +            cpuid(1, eax, ebx, ecx, std_caps);
> +            family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
> +            model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
> +            /* Haswell has slow gather */
> +            if(family == 6 && model < 70)
> +                rval |= AV_CPU_FLAG_SLOW_GATHER;
> +        }
> +
>  #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
>  if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
>  if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
> @@ -196,6 +204,10 @@ int ff_get_cpu_flags_x86(void)
>  used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
>  if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
>  rval |= AV_CPU_FLAG_AVXSLOW;
> +
> +        /* AMD cpus have slow gather */
> +            if(rval & AV_CPU_FLAG_AVX2)
> +                rval |= AV_CPU_FLAG_SLOW_GATHER;
>  } 
>

No, I'd rather limit this to currently released AMD CPUs.
Future ones are getting AVX512, which did speed up gathers on
Intel CPUs, as the ISA extension extended gathers and added
scatters.

Also your previous patch introduces ff_shuffle_filter_coefficients()
which is so bad it pretty much needs a complete rewrite.
You're also not detecting malloc errors or propagating them back.

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
  2021-12-20 14:47     ` Lynne
@ 2021-12-20 14:53       ` James Almer
  2021-12-20 14:59         ` Alan Kelly
  2021-12-20 15:01         ` Alan Kelly
  0 siblings, 2 replies; 10+ messages in thread
From: James Almer @ 2021-12-20 14:53 UTC (permalink / raw)
  To: ffmpeg-devel



On 12/20/2021 11:47 AM, Lynne wrote:
> 20 Dec 2021, 15:43 by alankelly-at-google.com@ffmpeg.org:
> 
>> This flag is set on Haswell and earlier and all AMD cpus.
>> ---
>>   Removes unnecessary indentation, clarifies comment and only sets flag on AMD
>>   cpus with AVX2.
>>   libavutil/cpu.h     |  1 +
>>   libavutil/x86/cpu.c | 14 +++++++++++++-
>>   2 files changed, 14 insertions(+), 1 deletion(-)
>>
>> diff --git a/libavutil/cpu.h b/libavutil/cpu.h
>> index ae443eccad..ce9bf14bf7 100644
>> --- a/libavutil/cpu.h
>> +++ b/libavutil/cpu.h
>> @@ -54,6 +54,7 @@
>>   #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
>>   #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
>>   #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
>> +#define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.
>>   
>>   #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
>>   #define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
>> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
>> index bcd41a50a2..563984f234 100644
>> --- a/libavutil/x86/cpu.c
>> +++ b/libavutil/x86/cpu.c
>> @@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void)
>>   if (max_std_level >= 7) {
>>   cpuid(7, eax, ebx, ecx, edx);
>>   #if HAVE_AVX2
>> -        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
>> +        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) {
>>   rval |= AV_CPU_FLAG_AVX2;
>> +            cpuid(1, eax, ebx, ecx, std_caps);
>> +            family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
>> +            model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
>> +            /* Haswell has slow gather */
>> +            if(family == 6 && model < 70)
>> +                rval |= AV_CPU_FLAG_SLOW_GATHER;
>> +        }
>> +
>>   #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
>>   if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
>>   if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
>> @@ -196,6 +204,10 @@ int ff_get_cpu_flags_x86(void)
>>   used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
>>   if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
>>   rval |= AV_CPU_FLAG_AVXSLOW;
>> +
>> +        /* AMD cpus have slow gather */
>> +            if(rval & AV_CPU_FLAG_AVX2)
>> +                rval |= AV_CPU_FLAG_SLOW_GATHER;
>>   }
>>
> 
> No, I'd rather limit AMD CPUs to all currently released CPUs.
> Future ones are getting AVX512, which did speed up gathers on
> Intel CPUs, as the ISA extension extended gathers and added
> scatters.

I wouldn't hold my breath for that, but it's probably a good idea 
anyway. Add a check so it's flagged only on Excavator and Zen <= 3.

> 
> Also your previous patch introduces ff_shuffle_filter_coefficients()
> which is so bad it pretty much needs a complete rewrite.
> You're also not detecting malloc errors or propagating them back.

That's unrelated to this patch.

> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
  2021-12-20 14:53       ` James Almer
@ 2021-12-20 14:59         ` Alan Kelly
  2021-12-20 15:26           ` Lynne
  2021-12-20 15:01         ` Alan Kelly
  1 sibling, 1 reply; 10+ messages in thread
From: Alan Kelly @ 2021-12-20 14:59 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Alan Kelly

This flag is set on Haswell and on AMD CPUs up to and including Zen 3.
---
 Sets this flag on Zen 3 and earlier.
 libavutil/cpu.h     |  1 +
 libavutil/x86/cpu.c | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index ae443eccad..ce9bf14bf7 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -54,6 +54,7 @@
 #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
 #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
 #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
+#define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.
 
 #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
 #define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..11467ba99d 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void)
     if (max_std_level >= 7) {
         cpuid(7, eax, ebx, ecx, edx);
 #if HAVE_AVX2
-        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
+        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) {
             rval |= AV_CPU_FLAG_AVX2;
+            cpuid(1, eax, ebx, ecx, std_caps);
+            family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+            model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+            /* Haswell has slow gather */
+            if(family == 6 && model < 70)
+                rval |= AV_CPU_FLAG_SLOW_GATHER;
+        }
+
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
         if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
             if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
@@ -196,6 +204,10 @@ int ff_get_cpu_flags_x86(void)
            used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
             if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
                 rval |= AV_CPU_FLAG_AVXSLOW;
+
+        /* Zen 3 and earlier have slow gather */
+            if((rval & AV_CPU_FLAG_AVX2) & family <= 25)
+                rval |= AV_CPU_FLAG_SLOW_GATHER;
         }
 
         /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
-- 
2.34.1.173.g76aa8bc2d0-goog

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
  2021-12-20 14:53       ` James Almer
  2021-12-20 14:59         ` Alan Kelly
@ 2021-12-20 15:01         ` Alan Kelly
  1 sibling, 0 replies; 10+ messages in thread
From: Alan Kelly @ 2021-12-20 15:01 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Mon, Dec 20, 2021 at 3:53 PM James Almer <jamrial@gmail.com> wrote:

>
>
> On 12/20/2021 11:47 AM, Lynne wrote:
> > 20 Dec 2021, 15:43 by alankelly-at-google.com@ffmpeg.org:
> >
> >> This flag is set on Haswell and earlier and all AMD cpus.
> >> ---
> >>   Removes unnecessary indentation, clarifies comment and only sets flag
> on AMD
> >>   cpus with AVX2.
> >>   libavutil/cpu.h     |  1 +
> >>   libavutil/x86/cpu.c | 14 +++++++++++++-
> >>   2 files changed, 14 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/libavutil/cpu.h b/libavutil/cpu.h
> >> index ae443eccad..ce9bf14bf7 100644
> >> --- a/libavutil/cpu.h
> >> +++ b/libavutil/cpu.h
> >> @@ -54,6 +54,7 @@
> >>   #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation
> Instruction Set 1
> >>   #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation
> Instruction Set 2
> >>   #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions:
> requires OS support even if YMM/ZMM registers aren't used
> >> +#define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.
> >>
> >>   #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
> >>   #define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
> >> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
> >> index bcd41a50a2..563984f234 100644
> >> --- a/libavutil/x86/cpu.c
> >> +++ b/libavutil/x86/cpu.c
> >> @@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void)
> >>   if (max_std_level >= 7) {
> >>   cpuid(7, eax, ebx, ecx, edx);
> >>   #if HAVE_AVX2
> >> -        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
> >> +        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) {
> >>   rval |= AV_CPU_FLAG_AVX2;
> >> +            cpuid(1, eax, ebx, ecx, std_caps);
> >> +            family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
> >> +            model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
> >> +            /* Haswell has slow gather */
> >> +            if(family == 6 && model < 70)
> >> +                rval |= AV_CPU_FLAG_SLOW_GATHER;
> >> +        }
> >> +
> >>   #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
> >>   if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
> >>   if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
> >> @@ -196,6 +204,10 @@ int ff_get_cpu_flags_x86(void)
> >>   used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
> >>   if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
> >>   rval |= AV_CPU_FLAG_AVXSLOW;
> >> +
> >> +        /* AMD cpus have slow gather */
> >> +            if(rval & AV_CPU_FLAG_AVX2)
> >> +                rval |= AV_CPU_FLAG_SLOW_GATHER;
> >>   }
> >>
> >
> > No, I'd rather limit AMD CPUs to all currently released CPUs.
> > Future ones are getting AVX512, which did speed up gathers on
> > Intel CPUs, as the ISA extension extended gathers and added
> > scatters.
>
> I wouldn't hold my breath for that, but it's probably a good idea
> anyway. A check so it's flagged only on Excavator and Zen <= 3.
>
> >
> > Also your previous patch introduces ff_shuffle_filter_coefficients()
> > which is so bad it pretty much needs a complete rewrite.
> > You're also not detecting malloc errors or propagating them back.
>
> That's unrelated to this patch.
>
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>

Updated patch sent with check for family <= 25 so that future CPUs will
have avx2 hscale enabled by default.

I may have time this week to look at ff_shuffle_filter_coefficients.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
  2021-12-20 14:59         ` Alan Kelly
@ 2021-12-20 15:26           ` Lynne
  2021-12-21 19:56             ` Alan Kelly
  0 siblings, 1 reply; 10+ messages in thread
From: Lynne @ 2021-12-20 15:26 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

20 Dec 2021, 15:59 by alankelly-at-google.com@ffmpeg.org:

> This flag is set on Haswell and earlier and all AMD cpus.
> ---
>  Sets this flag on Zen 3 and earlier.
>  libavutil/cpu.h     |  1 +
>  libavutil/x86/cpu.c | 14 +++++++++++++-
>  2 files changed, 14 insertions(+), 1 deletion(-)
>
> diff --git a/libavutil/cpu.h b/libavutil/cpu.h
> index ae443eccad..ce9bf14bf7 100644
> --- a/libavutil/cpu.h
> +++ b/libavutil/cpu.h
> @@ -54,6 +54,7 @@
>  #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
>  #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
>  #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
> +#define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.
>  
>  #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
>  #define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
> index bcd41a50a2..11467ba99d 100644
> --- a/libavutil/x86/cpu.c
> +++ b/libavutil/x86/cpu.c
> @@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void)
>  if (max_std_level >= 7) {
>  cpuid(7, eax, ebx, ecx, edx);
>  #if HAVE_AVX2
> -        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
> +        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) {
>  rval |= AV_CPU_FLAG_AVX2;
> +            cpuid(1, eax, ebx, ecx, std_caps);
> +            family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
> +            model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
> +            /* Haswell has slow gather */
> +            if(family == 6 && model < 70)
>

Shouldn't this check for the vendor as well?
Nit: space after the if.


> +                rval |= AV_CPU_FLAG_SLOW_GATHER;
> +        }
> +
>  #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
>  if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
>  if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
> @@ -196,6 +204,10 @@ int ff_get_cpu_flags_x86(void)
>  used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
>  if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
>  rval |= AV_CPU_FLAG_AVXSLOW;
> +
> +        /* Zen 3 and earlier have slow gather */
>

Nit: put a space after the start of a comment and indent
to the same level as the line below.


> +            if((rval & AV_CPU_FLAG_AVX2) & family <= 25)
>

Mistake: you're combining the conditions with a bitwise `&` instead of a
logical `&&`; you want `&& family <= 0x19` (hex because the other checks
are all hex too).
Nit: space after the if.


> +                rval |= AV_CPU_FLAG_SLOW_GATHER;
>  }
>  
>  /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
  2021-12-20 15:26           ` Lynne
@ 2021-12-21 19:56             ` Alan Kelly
  2021-12-21 20:52               ` Lynne
  0 siblings, 1 reply; 10+ messages in thread
From: Alan Kelly @ 2021-12-21 19:56 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Alan Kelly

This flag is set on Haswell and earlier Intel CPUs and on AMD CPUs up to
and including Zen 3 (family <= 0x19).
---
 Checks for family for Haswell. All checks are done where AVX2 flag is
 set as this is clearer.
 libavutil/cpu.h     |  1 +
 libavutil/x86/cpu.c | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index ae443eccad..ce9bf14bf7 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -54,6 +54,7 @@
 #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
 #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
 #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
+#define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.
 
 #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
 #define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..441b4695d5 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,21 @@ int ff_get_cpu_flags_x86(void)
     if (max_std_level >= 7) {
         cpuid(7, eax, ebx, ecx, edx);
 #if HAVE_AVX2
-        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
+        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) {
             rval |= AV_CPU_FLAG_AVX2;
+            cpuid(1, eax, ebx, ecx, std_caps);
+            family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+            model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+            /* Haswell has slow gather */
+            if (!strncmp(vendor.c, "GenuineIntel", 12))
+                if (family == 6 && model < 70)
+                    rval |= AV_CPU_FLAG_SLOW_GATHER;
+            /* Zen 3 and earlier have slow gather */
+            if (!strncmp(vendor.c, "AuthenticAMD", 12))
+                if (family <= 0x19)
+                    rval |= AV_CPU_FLAG_SLOW_GATHER;
+        }
+
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
         if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
             if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
-- 
2.34.1.173.g76aa8bc2d0-goog

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
  2021-12-21 19:56             ` Alan Kelly
@ 2021-12-21 20:52               ` Lynne
  0 siblings, 0 replies; 10+ messages in thread
From: Lynne @ 2021-12-21 20:52 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

21 Dec 2021, 20:56 by alankelly-at-google.com@ffmpeg.org:

> This flag is set on Haswell and earlier and all AMD cpus.
> ---
>  Checks for family for Haswell. All checks are done where AVX2 flag is
>  set as this is clearer.
>  libavutil/cpu.h     |  1 +
>  libavutil/x86/cpu.c | 15 ++++++++++++++-
>  2 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/libavutil/cpu.h b/libavutil/cpu.h
> index ae443eccad..ce9bf14bf7 100644
> --- a/libavutil/cpu.h
> +++ b/libavutil/cpu.h
> @@ -54,6 +54,7 @@
>  #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
>  #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
>  #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
> +#define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.
>  
>  #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
>  #define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
> index bcd41a50a2..441b4695d5 100644
> --- a/libavutil/x86/cpu.c
> +++ b/libavutil/x86/cpu.c
> @@ -146,8 +146,21 @@ int ff_get_cpu_flags_x86(void)
>  if (max_std_level >= 7) {
>  cpuid(7, eax, ebx, ecx, edx);
>  #if HAVE_AVX2
> -        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
> +        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) {
>  rval |= AV_CPU_FLAG_AVX2;
> +            cpuid(1, eax, ebx, ecx, std_caps);
> +            family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
> +            model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
> +            /* Haswell has slow gather */
> +            if (!strncmp(vendor.c, "GenuineIntel", 12))
> +                if (family == 6 && model < 70)
> +                    rval |= AV_CPU_FLAG_SLOW_GATHER;
> +            /* Zen 3 and earlier have slow gather */
> +            if (!strncmp(vendor.c, "AuthenticAMD", 12))
> +                if (family <= 0x19)
> +                    rval |= AV_CPU_FLAG_SLOW_GATHER;
> +        }
> +
>  #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
>  if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
>  if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
> -- 
> 2.34.1.173.g76aa8bc2d0-goog
>

LGTM, thanks
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2021-12-21 20:52 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-20 13:56 [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER Alan Kelly
2021-12-20 14:21 ` James Almer
2021-12-20 14:43   ` Alan Kelly
2021-12-20 14:47     ` Lynne
2021-12-20 14:53       ` James Almer
2021-12-20 14:59         ` Alan Kelly
2021-12-20 15:26           ` Lynne
2021-12-21 19:56             ` Alan Kelly
2021-12-21 20:52               ` Lynne
2021-12-20 15:01         ` Alan Kelly

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git