Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag
@ 2022-02-23  8:57 jianhua.wu-at-intel.com
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 2/6] avcodec/x86/hevc_mc: add qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl jianhua.wu-at-intel.com
                   ` (6 more replies)
  0 siblings, 7 replies; 11+ messages in thread
From: jianhua.wu-at-intel.com @ 2022-02-23  8:57 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Wu Jianhua

From: Wu Jianhua <jianhua.wu@intel.com>

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 configure                 | 13 +++++++---
 libavutil/cpu.c           |  1 +
 libavutil/cpu.h           |  1 +
 libavutil/x86/cpu.c       |  8 ++++--
 libavutil/x86/cpu.h       |  1 +
 libavutil/x86/x86inc.asm  | 53 ++++++++++++++++++++-------------------
 tests/checkasm/checkasm.c | 35 +++++++++++++-------------
 7 files changed, 63 insertions(+), 49 deletions(-)

diff --git a/configure b/configure
index 1535dc3c5b..d88c2ae979 100755
--- a/configure
+++ b/configure
@@ -444,6 +444,7 @@ Optimization options (experts only):
   --disable-fma4           disable FMA4 optimizations
   --disable-avx2           disable AVX2 optimizations
   --disable-avx512         disable AVX-512 optimizations
+  --disable-avx512icl      disable AVX-512ICL optimizations
   --disable-aesni          disable AESNI optimizations
   --disable-armv5te        disable armv5te optimizations
   --disable-armv6          disable armv6 optimizations
@@ -2098,6 +2099,7 @@ ARCH_EXT_LIST_X86_SIMD="
     avx
     avx2
     avx512
+    avx512icl
     fma3
     fma4
     mmx
@@ -2666,6 +2668,7 @@ fma3_deps="avx"
 fma4_deps="avx"
 avx2_deps="avx"
 avx512_deps="avx2"
+avx512icl_deps="avx512"
 
 mmx_external_deps="x86asm"
 mmx_inline_deps="inline_asm x86"
@@ -6128,10 +6131,11 @@ EOF
             elf*) enabled debug && append X86ASMFLAGS $x86asm_debug ;;
         esac
 
-        enabled avx512 && check_x86asm avx512_external "vmovdqa32 [eax]{k1}{z}, zmm0"
-        enabled avx2   && check_x86asm avx2_external   "vextracti128 xmm0, ymm0, 0"
-        enabled xop    && check_x86asm xop_external    "vpmacsdd xmm0, xmm1, xmm2, xmm3"
-        enabled fma4   && check_x86asm fma4_external   "vfmaddps ymm0, ymm1, ymm2, ymm3"
+        enabled avx512    && check_x86asm avx512_external    "vmovdqa32 [eax]{k1}{z}, zmm0"
+        enabled avx512icl && check_x86asm avx512icl_external "vpdpwssds zmm31{k1}{z}, zmm29, zmm28"
+        enabled avx2      && check_x86asm avx2_external      "vextracti128 xmm0, ymm0, 0"
+        enabled xop       && check_x86asm xop_external       "vpmacsdd xmm0, xmm1, xmm2, xmm3"
+        enabled fma4      && check_x86asm fma4_external      "vfmaddps ymm0, ymm1, ymm2, ymm3"
         check_x86asm cpunop          "CPU amdnop"
     fi
 
@@ -7471,6 +7475,7 @@ if enabled x86; then
     echo "AVX enabled               ${avx-no}"
     echo "AVX2 enabled              ${avx2-no}"
     echo "AVX-512 enabled           ${avx512-no}"
+    echo "AVX-512ICL enabled        ${avx512icl-no}"
     echo "XOP enabled               ${xop-no}"
     echo "FMA3 enabled              ${fma3-no}"
     echo "FMA4 enabled              ${fma4-no}"
diff --git a/libavutil/cpu.c b/libavutil/cpu.c
index 1368502245..833c220192 100644
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@@ -137,6 +137,7 @@ int av_parse_cpu_caps(unsigned *flags, const char *s)
         { "cmov",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_CMOV     },    .unit = "flags" },
         { "aesni",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AESNI    },    .unit = "flags" },
         { "avx512"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX512   },    .unit = "flags" },
+        { "avx512icl",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX512ICL   }, .unit = "flags" },
         { "slowgather", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SLOW_GATHER }, .unit = "flags" },
 
 #define CPU_FLAG_P2 AV_CPU_FLAG_CMOV | AV_CPU_FLAG_MMX
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index ce9bf14bf7..9711e574c5 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -54,6 +54,7 @@
 #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
 #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
 #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
+#define AV_CPU_FLAG_AVX512ICL  0x200000 ///< F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ
 #define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.
 
 #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index 7b13fcae91..d6cd4fab9c 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -150,9 +150,13 @@ int ff_get_cpu_flags_x86(void)
             rval |= AV_CPU_FLAG_AVX2;
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
         if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
-            if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
+            if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000) {
                 rval |= AV_CPU_FLAG_AVX512;
-
+#if HAVE_AVX512ICL
+                if ((ebx & 0xd0200000) == 0xd0200000 && (ecx & 0x5f42) == 0x5f42)
+                    rval |= AV_CPU_FLAG_AVX512ICL;
+#endif /* HAVE_AVX512ICL */
+            }
         }
 #endif /* HAVE_AVX512 */
 #endif /* HAVE_AVX2 */
diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h
index 937c697fa0..40a1eef0ab 100644
--- a/libavutil/x86/cpu.h
+++ b/libavutil/x86/cpu.h
@@ -80,6 +80,7 @@
 #define EXTERNAL_AVX2_SLOW(flags)   CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX)
 #define EXTERNAL_AESNI(flags)       CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI)
 #define EXTERNAL_AVX512(flags)      CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512)
+#define EXTERNAL_AVX512ICL(flags)   CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512ICL)
 
 #define INLINE_AMD3DNOW(flags)      CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOW)
 #define INLINE_AMD3DNOWEXT(flags)   CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOWEXT)
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 01c35e3a4b..251ee797de 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -817,32 +817,33 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 
 ; cpuflags
 
-%assign cpuflags_mmx      (1<<0)
-%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
-%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
-%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
-%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
-%assign cpuflags_sse2     (1<<5) | cpuflags_sse
-%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
-%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
-%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
-%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
-%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
-%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
-%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
-%assign cpuflags_avx      (1<<13)| cpuflags_sse42
-%assign cpuflags_xop      (1<<14)| cpuflags_avx
-%assign cpuflags_fma4     (1<<15)| cpuflags_avx
-%assign cpuflags_fma3     (1<<16)| cpuflags_avx
-%assign cpuflags_bmi1     (1<<17)| cpuflags_avx|cpuflags_lzcnt
-%assign cpuflags_bmi2     (1<<18)| cpuflags_bmi1
-%assign cpuflags_avx2     (1<<19)| cpuflags_fma3|cpuflags_bmi2
-%assign cpuflags_avx512   (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
-
-%assign cpuflags_cache32  (1<<21)
-%assign cpuflags_cache64  (1<<22)
-%assign cpuflags_aligned  (1<<23) ; not a cpu feature, but a function variant
-%assign cpuflags_atom     (1<<24)
+%assign cpuflags_mmx       (1<<0)
+%assign cpuflags_mmx2      (1<<1) | cpuflags_mmx
+%assign cpuflags_3dnow     (1<<2) | cpuflags_mmx
+%assign cpuflags_3dnowext  (1<<3) | cpuflags_3dnow
+%assign cpuflags_sse       (1<<4) | cpuflags_mmx2
+%assign cpuflags_sse2      (1<<5) | cpuflags_sse
+%assign cpuflags_sse2slow  (1<<6) | cpuflags_sse2
+%assign cpuflags_lzcnt     (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3      (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3     (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4      (1<<10)| cpuflags_ssse3
+%assign cpuflags_sse42     (1<<11)| cpuflags_sse4
+%assign cpuflags_aesni     (1<<12)| cpuflags_sse42
+%assign cpuflags_avx       (1<<13)| cpuflags_sse42
+%assign cpuflags_xop       (1<<14)| cpuflags_avx
+%assign cpuflags_fma4      (1<<15)| cpuflags_avx
+%assign cpuflags_fma3      (1<<16)| cpuflags_avx
+%assign cpuflags_bmi1      (1<<17)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2      (1<<18)| cpuflags_bmi1
+%assign cpuflags_avx2      (1<<19)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512    (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
+%assign cpuflags_avx512icl (1<<25)| cpuflags_avx512
+
+%assign cpuflags_cache32   (1<<21)
+%assign cpuflags_cache64   (1<<22)
+%assign cpuflags_aligned   (1<<23) ; not a cpu feature, but a function variant
+%assign cpuflags_atom      (1<<24)
 
 ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
 %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index f74125e810..e77b4ec20f 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -220,23 +220,24 @@ static const struct {
     { "MMI",      "mmi",      AV_CPU_FLAG_MMI },
     { "MSA",      "msa",      AV_CPU_FLAG_MSA },
 #elif ARCH_X86
-    { "MMX",      "mmx",      AV_CPU_FLAG_MMX|AV_CPU_FLAG_CMOV },
-    { "MMXEXT",   "mmxext",   AV_CPU_FLAG_MMXEXT },
-    { "3DNOW",    "3dnow",    AV_CPU_FLAG_3DNOW },
-    { "3DNOWEXT", "3dnowext", AV_CPU_FLAG_3DNOWEXT },
-    { "SSE",      "sse",      AV_CPU_FLAG_SSE },
-    { "SSE2",     "sse2",     AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW },
-    { "SSE3",     "sse3",     AV_CPU_FLAG_SSE3|AV_CPU_FLAG_SSE3SLOW },
-    { "SSSE3",    "ssse3",    AV_CPU_FLAG_SSSE3|AV_CPU_FLAG_ATOM },
-    { "SSE4.1",   "sse4",     AV_CPU_FLAG_SSE4 },
-    { "SSE4.2",   "sse42",    AV_CPU_FLAG_SSE42 },
-    { "AES-NI",   "aesni",    AV_CPU_FLAG_AESNI },
-    { "AVX",      "avx",      AV_CPU_FLAG_AVX },
-    { "XOP",      "xop",      AV_CPU_FLAG_XOP },
-    { "FMA3",     "fma3",     AV_CPU_FLAG_FMA3 },
-    { "FMA4",     "fma4",     AV_CPU_FLAG_FMA4 },
-    { "AVX2",     "avx2",     AV_CPU_FLAG_AVX2 },
-    { "AVX-512",  "avx512",   AV_CPU_FLAG_AVX512 },
+    { "MMX",        "mmx",       AV_CPU_FLAG_MMX|AV_CPU_FLAG_CMOV },
+    { "MMXEXT",     "mmxext",    AV_CPU_FLAG_MMXEXT },
+    { "3DNOW",      "3dnow",     AV_CPU_FLAG_3DNOW },
+    { "3DNOWEXT",   "3dnowext",  AV_CPU_FLAG_3DNOWEXT },
+    { "SSE",        "sse",       AV_CPU_FLAG_SSE },
+    { "SSE2",       "sse2",      AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW },
+    { "SSE3",       "sse3",      AV_CPU_FLAG_SSE3|AV_CPU_FLAG_SSE3SLOW },
+    { "SSSE3",      "ssse3",     AV_CPU_FLAG_SSSE3|AV_CPU_FLAG_ATOM },
+    { "SSE4.1",     "sse4",      AV_CPU_FLAG_SSE4 },
+    { "SSE4.2",     "sse42",     AV_CPU_FLAG_SSE42 },
+    { "AES-NI",     "aesni",     AV_CPU_FLAG_AESNI },
+    { "AVX",        "avx",       AV_CPU_FLAG_AVX },
+    { "XOP",        "xop",       AV_CPU_FLAG_XOP },
+    { "FMA3",       "fma3",      AV_CPU_FLAG_FMA3 },
+    { "FMA4",       "fma4",      AV_CPU_FLAG_FMA4 },
+    { "AVX2",       "avx2",      AV_CPU_FLAG_AVX2 },
+    { "AVX-512",    "avx512",    AV_CPU_FLAG_AVX512 },
+    { "AVX-512ICL", "avx512icl", AV_CPU_FLAG_AVX512ICL },
 #elif ARCH_LOONGARCH
     { "LSX",      "lsx",      AV_CPU_FLAG_LSX },
     { "LASX",     "lasx",     AV_CPU_FLAG_LASX },
-- 
2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [FFmpeg-devel] [PATCH 2/6] avcodec/x86/hevc_mc: add qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl
  2022-02-23  8:57 [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag jianhua.wu-at-intel.com
@ 2022-02-23  8:57 ` jianhua.wu-at-intel.com
  2022-03-10 15:22   ` Henrik Gramner
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 3/6] avcodec/x86/hevc_mc: add qpel_h16_8_avx512icl jianhua.wu-at-intel.com
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 11+ messages in thread
From: jianhua.wu-at-intel.com @ 2022-02-23  8:57 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Wu Jianhua

From: Wu Jianhua <jianhua.wu@intel.com>

This commit uses the `vpdpbusd` instruction, introduced by AVX-512 VNNI,
to compute the horizontal filter.

ff_hevc_put_hevc_qpel_h8_8_sse4       1039169
ff_hevc_put_hevc_qpel_h8_8_avx512icl   677153
ff_hevc_put_hevc_qpel_hv8_8_sse4      3603511
ff_hevc_put_hevc_qpel_hv8_8_avx512icl 2995354

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavcodec/x86/hevc_mc.asm    | 139 ++++++++++++++++++++++++++++++++++
 libavcodec/x86/hevcdsp.h      |   3 +
 libavcodec/x86/hevcdsp_init.c |   4 +
 3 files changed, 146 insertions(+)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index ff6ed0711a..026b6b48ee 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -87,6 +87,26 @@ QPEL_TABLE 12, 4, w, sse4
 QPEL_TABLE  8,16, b, avx2
 QPEL_TABLE 10, 8, w, avx2
 
+QPEL_TABLE  8, 1, b, avx512icl_h
+QPEL_TABLE  8, 1, d, avx512icl_v
+
+pb_qpel_shuffle_index: db  0,  1,  2,  3
+                       db  1,  2,  3,  4
+                       db  2,  3,  4,  5
+                       db  3,  4,  5,  6
+                       db  4,  5,  6,  7
+                       db  5,  6,  7,  8
+                       db  6,  7,  8,  9
+                       db  7,  8,  9, 10
+                       db  4,  5,  6,  7
+                       db  5,  6,  7,  8
+                       db  6,  7,  8,  9
+                       db  7,  8,  9, 10
+                       db  8,  9, 10, 11
+                       db  9, 10, 11, 12
+                       db 10, 11, 12, 13
+                       db 11, 12, 13, 14
+
 SECTION .text
 
 %define MAX_PB_SIZE  64
@@ -1670,3 +1690,122 @@ HEVC_PUT_HEVC_QPEL_HV 16, 10
 
 %endif ;AVX2
 %endif ; ARCH_X86_64
+
+%macro QPEL_FILTER_H 5
+%define %%table hevc_qpel_filters_avx512icl_h_%1
+%assign %%offset 4
+    dec %2q
+    shl %2q, 3
+%ifdef PIC
+    lea %5q, [%%table]
+    %define FILTER %5q
+%else
+    %define FILTER %%table
+%endif
+    vpbroadcastd m%3, [FILTER + %2q + 0*%%offset]
+    vpbroadcastd m%4, [FILTER + %2q + 1*%%offset]
+%endmacro
+
+%macro QPEL_FILTER_V 5
+    vpbroadcastd m%3, [%5 + %2q + 4*%4]
+%endmacro
+
+%macro QPEL_LOAD_SHUF 2
+    movu m%1, [pb_qpel_shuffle_index +  0]
+    movu m%2, [pb_qpel_shuffle_index + 32]
+%endmacro
+
+; required: m0-m5
+; %1: dst register index
+; %2: name for src
+%macro QPEL_H_LOAD_COMPUTE 2
+    pxor            m%1, m%1
+    movu            xm4, [%2q - 3]
+    vpermb           m5, m2, m4
+    vpermb           m4, m3, m4
+    vpdpbusd        m%1, m5, m0
+    vpdpbusd        m%1, m4, m1
+%endmacro
+
+%macro HEVC_PUT_HEVC_QPEL_AVX512ICL 2
+cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
+    QPEL_FILTER_H   %1, mx, 0, 1, tmp
+    QPEL_LOAD_SHUF   2, 3
+.loop:
+    QPEL_H_LOAD_COMPUTE   6, src
+    vpmovdw             xm6, m6
+    movu             [dstq], xm6
+    LOOP_END            dst, src, srcstride
+    RET
+%endmacro
+
+%macro HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 2
+cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 8, dst, src, srcstride, height, mx, my, tmp
+%assign %%shift 6
+%assign %%extra 7
+    QPEL_FILTER_H    %1, mx, 0, 1, tmp
+    QPEL_LOAD_SHUF    2, 3
+    lea            tmpq, [srcstrideq*3]
+    sub            srcq, tmpq
+    sub             myq, 1
+    shl             myq, 5
+%ifdef PIC
+%define %%table hevc_qpel_filters_avx512icl_v_%1
+    lea tmpq, [%%table]
+    %define FILTER tmpq
+%else
+    %define FILTER %%table
+%endif
+%assign %%i 6
+%assign %%j 0
+%rep %1
+    QPEL_FILTER_V %1, my, %%i, %%j, FILTER
+    %assign %%i %%i+1
+    %assign %%j %%j+1
+%endrep
+%rep %%extra
+    QPEL_H_LOAD_COMPUTE %%i, src
+    add srcq, srcstrideq
+%assign %%i %%i+1
+%endrep
+.loop:
+    QPEL_H_LOAD_COMPUTE %%i, src
+    vpmulld           m22, m14, m6
+    vpmulld           m23, m15, m7
+    vpmulld           m24, m16, m8
+    vpmulld           m25, m17, m9
+    vpaddd            m26, m22, m23
+    vpaddd            m24, m25
+    vpaddd            m26, m24
+    vpmulld           m22, m18, m10
+    vpmulld           m23, m19, m11
+    vpmulld           m24, m20, m12
+    vpmulld           m25, m21, m13
+    vpaddd            m22, m22, m23
+    vpaddd            m24, m25
+    vpaddd            m26, m24
+    vpaddd            m22, m26
+    mova              m14, m15
+    mova              m15, m16
+    mova              m16, m17
+    mova              m17, m18
+    mova              m18, m19
+    mova              m19, m20
+    mova              m20, m21
+    vpsrad            m22, %%shift
+    vpmovdw          xm22, m22
+    movu           [dstq], xm22
+    LOOP_END          dst, src, srcstride
+
+    RET
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX512ICL_EXTERNAL
+
+INIT_YMM avx512icl
+HEVC_PUT_HEVC_QPEL_AVX512ICL 8, 8
+HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8
+
+%endif
+%endif
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 67be0a9059..5a495d2563 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -233,6 +233,9 @@ WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);
 WEIGHTING_PROTOTYPES(12, sse4);
 
+void ff_hevc_put_hevc_qpel_h8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_hv8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+
 ///////////////////////////////////////////////////////////////////////////////
 // TRANSFORM_ADD
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 8a3fa2744b..0341835944 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -878,6 +878,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 
             c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
         }
+        if (EXTERNAL_AVX512ICL(cpu_flags)) {
+            c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
+            c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
+        }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
-- 
2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [FFmpeg-devel] [PATCH 3/6] avcodec/x86/hevc_mc: add qpel_h16_8_avx512icl
  2022-02-23  8:57 [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag jianhua.wu-at-intel.com
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 2/6] avcodec/x86/hevc_mc: add qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl jianhua.wu-at-intel.com
@ 2022-02-23  8:57 ` jianhua.wu-at-intel.com
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 4/6] avcodec/x86/hevc_mc: add qpel_h4_8_avx512icl jianhua.wu-at-intel.com
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: jianhua.wu-at-intel.com @ 2022-02-23  8:57 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Wu Jianhua

From: Wu Jianhua <jianhua.wu@intel.com>

ff_hevc_put_hevc_qpel_h16_8_sse4       3290870
ff_hevc_put_hevc_qpel_h16_8_avx512icl  1730033

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavcodec/x86/hevc_mc.asm    | 39 ++++++++++++++++++++++++++++++-----
 libavcodec/x86/hevcdsp.h      |  1 +
 libavcodec/x86/hevcdsp_init.c |  1 +
 3 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 026b6b48ee..8c128f5202 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -89,6 +89,7 @@ QPEL_TABLE 10, 8, w, avx2
 
 QPEL_TABLE  8, 1, b, avx512icl_h
 QPEL_TABLE  8, 1, d, avx512icl_v
+QPEL_TABLE 16, 1, b, avx512icl_h
 
 pb_qpel_shuffle_index: db  0,  1,  2,  3
                        db  1,  2,  3,  4
@@ -98,6 +99,14 @@ pb_qpel_shuffle_index: db  0,  1,  2,  3
                        db  5,  6,  7,  8
                        db  6,  7,  8,  9
                        db  7,  8,  9, 10
+                       db  8,  9, 10, 11
+                       db  9, 10, 11, 12
+                       db 10, 11, 12, 13
+                       db 11, 12, 13, 14
+                       db 12, 13, 14, 15
+                       db 13, 14, 15, 16
+                       db 14, 15, 16, 17
+                       db 15, 16, 17, 18
                        db  4,  5,  6,  7
                        db  5,  6,  7,  8
                        db  6,  7,  8,  9
@@ -106,6 +115,14 @@ pb_qpel_shuffle_index: db  0,  1,  2,  3
                        db  9, 10, 11, 12
                        db 10, 11, 12, 13
                        db 11, 12, 13, 14
+                       db 12, 13, 14, 15
+                       db 13, 14, 15, 16
+                       db 14, 15, 16, 17
+                       db 15, 16, 17, 18
+                       db 16, 17, 18, 19
+                       db 17, 18, 19, 20
+                       db 18, 19, 20, 21
+                       db 19, 20, 21, 22
 
 SECTION .text
 
@@ -1712,7 +1729,7 @@ HEVC_PUT_HEVC_QPEL_HV 16, 10
 
 %macro QPEL_LOAD_SHUF 2
     movu m%1, [pb_qpel_shuffle_index +  0]
-    movu m%2, [pb_qpel_shuffle_index + 32]
+    movu m%2, [pb_qpel_shuffle_index + 64]
 %endmacro
 
 ; required: m0-m5
@@ -1720,7 +1737,11 @@ HEVC_PUT_HEVC_QPEL_HV 16, 10
 ; %2: name for src
 %macro QPEL_H_LOAD_COMPUTE 2
     pxor            m%1, m%1
-    movu            xm4, [%2q - 3]
+%if mmsize == 64
+    movu            ym4, [%2]
+%else
+    movu            xm4, [%2]
+%endif
     vpermb           m5, m2, m4
     vpermb           m4, m3, m4
     vpdpbusd        m%1, m5, m0
@@ -1732,9 +1753,14 @@ cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
     QPEL_FILTER_H   %1, mx, 0, 1, tmp
     QPEL_LOAD_SHUF   2, 3
 .loop:
-    QPEL_H_LOAD_COMPUTE   6, src
+    QPEL_H_LOAD_COMPUTE   6, srcq - 3
+%if %1 == 8
     vpmovdw             xm6, m6
     movu             [dstq], xm6
+%else
+    vpmovdw             ym6, m6
+    movu             [dstq], ym6
+%endif
     LOOP_END            dst, src, srcstride
     RET
 %endmacro
@@ -1764,12 +1790,12 @@ cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 8, dst, src, srcstride, height, mx, my
     %assign %%j %%j+1
 %endrep
 %rep %%extra
-    QPEL_H_LOAD_COMPUTE %%i, src
+    QPEL_H_LOAD_COMPUTE %%i, srcq - 3
     add srcq, srcstrideq
 %assign %%i %%i+1
 %endrep
 .loop:
-    QPEL_H_LOAD_COMPUTE %%i, src
+    QPEL_H_LOAD_COMPUTE %%i, srcq - 3
     vpmulld           m22, m14, m6
     vpmulld           m23, m15, m7
     vpmulld           m24, m16, m8
@@ -1807,5 +1833,8 @@ INIT_YMM avx512icl
 HEVC_PUT_HEVC_QPEL_AVX512ICL 8, 8
 HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8
 
+INIT_ZMM avx512icl
+HEVC_PUT_HEVC_QPEL_AVX512ICL 16, 8
+
 %endif
 %endif
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 5a495d2563..6e3fc01ad0 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -234,6 +234,7 @@ WEIGHTING_PROTOTYPES(10, sse4);
 WEIGHTING_PROTOTYPES(12, sse4);
 
 void ff_hevc_put_hevc_qpel_h8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h16_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_hv8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 0341835944..4023faa654 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -880,6 +880,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         }
         if (EXTERNAL_AVX512ICL(cpu_flags)) {
             c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
+            c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_avx512icl;
             c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
         }
     } else if (bit_depth == 10) {
-- 
2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [FFmpeg-devel] [PATCH 4/6] avcodec/x86/hevc_mc: add qpel_h4_8_avx512icl
  2022-02-23  8:57 [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag jianhua.wu-at-intel.com
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 2/6] avcodec/x86/hevc_mc: add qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl jianhua.wu-at-intel.com
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 3/6] avcodec/x86/hevc_mc: add qpel_h16_8_avx512icl jianhua.wu-at-intel.com
@ 2022-02-23  8:57 ` jianhua.wu-at-intel.com
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 5/6] avcodec/x86/hevc_mc: add qpel_h32_8_avx512icl jianhua.wu-at-intel.com
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: jianhua.wu-at-intel.com @ 2022-02-23  8:57 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Wu Jianhua

From: Wu Jianhua <jianhua.wu@intel.com>

ff_hevc_put_hevc_qpel_h4_8_sse4       993694
ff_hevc_put_hevc_qpel_h4_8_avx512icl  686647

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavcodec/x86/hevc_mc.asm    | 12 ++++++++++--
 libavcodec/x86/hevcdsp.h      |  1 +
 libavcodec/x86/hevcdsp_init.c |  1 +
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 8c128f5202..25880b8858 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -87,6 +87,7 @@ QPEL_TABLE 12, 4, w, sse4
 QPEL_TABLE  8,16, b, avx2
 QPEL_TABLE 10, 8, w, avx2
 
+QPEL_TABLE  4, 1, b, avx512icl_h
 QPEL_TABLE  8, 1, b, avx512icl_h
 QPEL_TABLE  8, 1, d, avx512icl_v
 QPEL_TABLE 16, 1, b, avx512icl_h
@@ -1734,7 +1735,7 @@ HEVC_PUT_HEVC_QPEL_HV 16, 10
 
 ; required: m0-m5
 ; %1: dst register index
-; %2: name for src
+; %2: src
 %macro QPEL_H_LOAD_COMPUTE 2
     pxor            m%1, m%1
 %if mmsize == 64
@@ -1754,9 +1755,13 @@ cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
     QPEL_LOAD_SHUF   2, 3
 .loop:
     QPEL_H_LOAD_COMPUTE   6, srcq - 3
-%if %1 == 8
+%if %1 < 16
     vpmovdw             xm6, m6
+%if %1 == 4
+    movq             [dstq], xm6
+%else
     movu             [dstq], xm6
+%endif
 %else
     vpmovdw             ym6, m6
     movu             [dstq], ym6
@@ -1829,6 +1834,9 @@ cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 8, dst, src, srcstride, height, mx, my
 %if ARCH_X86_64
 %if HAVE_AVX512ICL_EXTERNAL
 
+INIT_XMM avx512icl
+HEVC_PUT_HEVC_QPEL_AVX512ICL 4, 8
+
 INIT_YMM avx512icl
 HEVC_PUT_HEVC_QPEL_AVX512ICL 8, 8
 HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 6e3fc01ad0..51ffdc9628 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -233,6 +233,7 @@ WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);
 WEIGHTING_PROTOTYPES(12, sse4);
 
+void ff_hevc_put_hevc_qpel_h4_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_h8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_h16_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_hv8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 4023faa654..be1484d06e 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -879,6 +879,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
         }
         if (EXTERNAL_AVX512ICL(cpu_flags)) {
+            c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_avx512icl;
             c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
             c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_avx512icl;
             c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
-- 
2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [FFmpeg-devel] [PATCH 5/6] avcodec/x86/hevc_mc: add qpel_h32_8_avx512icl
  2022-02-23  8:57 [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag jianhua.wu-at-intel.com
                   ` (2 preceding siblings ...)
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 4/6] avcodec/x86/hevc_mc: add qpel_h4_8_avx512icl jianhua.wu-at-intel.com
@ 2022-02-23  8:57 ` jianhua.wu-at-intel.com
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 6/6] avcodec/x86/hevc_mc: add qpel_h64_8_avx512icl jianhua.wu-at-intel.com
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: jianhua.wu-at-intel.com @ 2022-02-23  8:57 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Wu Jianhua

From: Wu Jianhua <jianhua.wu@intel.com>

ff_hevc_put_hevc_qpel_h32_8_sse4       14122151
ff_hevc_put_hevc_qpel_h32_8_avx2        9337675
ff_hevc_put_hevc_qpel_h32_8_avx512icl   6424654

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavcodec/x86/hevc_mc.asm    | 7 +++++++
 libavcodec/x86/hevcdsp.h      | 1 +
 libavcodec/x86/hevcdsp_init.c | 1 +
 3 files changed, 9 insertions(+)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 25880b8858..4cf5dcd338 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -91,6 +91,7 @@ QPEL_TABLE  4, 1, b, avx512icl_h
 QPEL_TABLE  8, 1, b, avx512icl_h
 QPEL_TABLE  8, 1, d, avx512icl_v
 QPEL_TABLE 16, 1, b, avx512icl_h
+QPEL_TABLE 32, 1, b, avx512icl_h
 
 pb_qpel_shuffle_index: db  0,  1,  2,  3
                        db  1,  2,  3,  4
@@ -1765,6 +1766,11 @@ cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
 %else
     vpmovdw             ym6, m6
     movu             [dstq], ym6
+%endif
+%if %1 == 32
+    QPEL_H_LOAD_COMPUTE   7, srcq + 16 - 3
+    vpmovdw             ym7, m7
+    movu        [dstq + 32], ym7
 %endif
     LOOP_END            dst, src, srcstride
     RET
@@ -1843,6 +1849,7 @@ HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8
 
 INIT_ZMM avx512icl
 HEVC_PUT_HEVC_QPEL_AVX512ICL 16, 8
+HEVC_PUT_HEVC_QPEL_AVX512ICL 32, 8
 
 %endif
 %endif
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 51ffdc9628..8d3c3cc75f 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -236,6 +236,7 @@ WEIGHTING_PROTOTYPES(12, sse4);
 void ff_hevc_put_hevc_qpel_h4_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_h8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_h16_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h32_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_hv8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index be1484d06e..e9002c8b15 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -882,6 +882,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_avx512icl;
             c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
             c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_avx512icl;
+            c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx512icl;
             c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
         }
     } else if (bit_depth == 10) {
-- 
2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [FFmpeg-devel] [PATCH 6/6] avcodec/x86/hevc_mc: add qpel_h64_8_avx512icl
  2022-02-23  8:57 [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag jianhua.wu-at-intel.com
                   ` (3 preceding siblings ...)
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 5/6] avcodec/x86/hevc_mc: add qpel_h32_8_avx512icl jianhua.wu-at-intel.com
@ 2022-02-23  8:57 ` jianhua.wu-at-intel.com
  2022-03-02  5:33 ` [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag Wu, Jianhua
  2022-03-10 14:34 ` Henrik Gramner
  6 siblings, 0 replies; 11+ messages in thread
From: jianhua.wu-at-intel.com @ 2022-02-23  8:57 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Wu Jianhua

From: Wu Jianhua <jianhua.wu@intel.com>

ff_hevc_put_hevc_qpel_h64_8_sse4       56782981
ff_hevc_put_hevc_qpel_h64_8_avx2       40097816
ff_hevc_put_hevc_qpel_h64_8_avx512icl  25488576

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavcodec/x86/hevc_mc.asm    | 12 +++++++++++-
 libavcodec/x86/hevcdsp.h      |  1 +
 libavcodec/x86/hevcdsp_init.c |  1 +
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 4cf5dcd338..37264962af 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -92,6 +92,7 @@ QPEL_TABLE  8, 1, b, avx512icl_h
 QPEL_TABLE  8, 1, d, avx512icl_v
 QPEL_TABLE 16, 1, b, avx512icl_h
 QPEL_TABLE 32, 1, b, avx512icl_h
+QPEL_TABLE 64, 1, b, avx512icl_h
 
 pb_qpel_shuffle_index: db  0,  1,  2,  3
                        db  1,  2,  3,  4
@@ -1767,10 +1768,18 @@ cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
     vpmovdw             ym6, m6
     movu             [dstq], ym6
 %endif
-%if %1 == 32
+%if %1 > 16
     QPEL_H_LOAD_COMPUTE   7, srcq + 16 - 3
     vpmovdw             ym7, m7
     movu        [dstq + 32], ym7
+%endif
+%if %1 > 32
+    QPEL_H_LOAD_COMPUTE   6, srcq + 32 - 3
+    QPEL_H_LOAD_COMPUTE   7, srcq + 48 - 3
+    vpmovdw             ym6, m6
+    vpmovdw             ym7, m7
+    movu        [dstq + 64], ym6
+    movu        [dstq + 96], ym7
 %endif
     LOOP_END            dst, src, srcstride
     RET
@@ -1850,6 +1859,7 @@ HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8
 INIT_ZMM avx512icl
 HEVC_PUT_HEVC_QPEL_AVX512ICL 16, 8
 HEVC_PUT_HEVC_QPEL_AVX512ICL 32, 8
+HEVC_PUT_HEVC_QPEL_AVX512ICL 64, 8
 
 %endif
 %endif
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 8d3c3cc75f..24e35bc032 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -237,6 +237,7 @@ void ff_hevc_put_hevc_qpel_h4_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t
 void ff_hevc_put_hevc_qpel_h8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_h16_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_h32_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h64_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_hv8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index e9002c8b15..64fa5bc1f8 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -883,6 +883,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
             c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_avx512icl;
             c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx512icl;
+            c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx512icl;
             c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
         }
     } else if (bit_depth == 10) {
-- 
2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag
  2022-02-23  8:57 [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag jianhua.wu-at-intel.com
                   ` (4 preceding siblings ...)
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 6/6] avcodec/x86/hevc_mc: add qpel_h64_8_avx512icl jianhua.wu-at-intel.com
@ 2022-03-02  5:33 ` Wu, Jianhua
  2022-03-09  7:38   ` Wu, Jianhua
  2022-03-10 14:34 ` Henrik Gramner
  6 siblings, 1 reply; 11+ messages in thread
From: Wu, Jianhua @ 2022-03-02  5:33 UTC (permalink / raw)
  To: ffmpeg-devel

Ping.
> -----Original Message-----
> From: Wu, Jianhua <jianhua.wu@intel.com>
> Sent: Wednesday, February 23, 2022 4:58 PM
> To: ffmpeg-devel@ffmpeg.org
> Cc: Wu, Jianhua <jianhua.wu@intel.com>
> Subject: [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag
> 
> From: Wu Jianhua <jianhua.wu@intel.com>
> 
> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> ---
>  configure                 | 13 +++++++---
>  libavutil/cpu.c           |  1 +
>  libavutil/cpu.h           |  1 +
>  libavutil/x86/cpu.c       |  8 ++++--
>  libavutil/x86/cpu.h       |  1 +
>  libavutil/x86/x86inc.asm  | 53 ++++++++++++++++++++-------------------
>  tests/checkasm/checkasm.c | 35 +++++++++++++-------------
>  7 files changed, 63 insertions(+), 49 deletions(-)
> 
> diff --git a/configure b/configure
> index 1535dc3c5b..d88c2ae979 100755
> --- a/configure
> +++ b/configure
> @@ -444,6 +444,7 @@ Optimization options (experts only):
>    --disable-fma4           disable FMA4 optimizations
>    --disable-avx2           disable AVX2 optimizations
>    --disable-avx512         disable AVX-512 optimizations
> +  --disable-avx512icl      disable AVX-512ICL optimizations
>    --disable-aesni          disable AESNI optimizations
>    --disable-armv5te        disable armv5te optimizations
>    --disable-armv6          disable armv6 optimizations
> @@ -2098,6 +2099,7 @@ ARCH_EXT_LIST_X86_SIMD="
>      avx
>      avx2
>      avx512
> +    avx512icl
>      fma3
>      fma4
>      mmx
> @@ -2666,6 +2668,7 @@ fma3_deps="avx"
>  fma4_deps="avx"
>  avx2_deps="avx"
>  avx512_deps="avx2"
> +avx512icl_deps="avx512"
> 
>  mmx_external_deps="x86asm"
>  mmx_inline_deps="inline_asm x86"
> @@ -6128,10 +6131,11 @@ EOF
>              elf*) enabled debug && append X86ASMFLAGS $x86asm_debug ;;
>          esac
> 
> -        enabled avx512 && check_x86asm avx512_external "vmovdqa32
> [eax]{k1}{z}, zmm0"
> -        enabled avx2   && check_x86asm avx2_external   "vextracti128 xmm0,
> ymm0, 0"
> -        enabled xop    && check_x86asm xop_external    "vpmacsdd xmm0,
> xmm1, xmm2, xmm3"
> -        enabled fma4   && check_x86asm fma4_external   "vfmaddps ymm0,
> ymm1, ymm2, ymm3"
> +        enabled avx512    && check_x86asm avx512_external    "vmovdqa32
> [eax]{k1}{z}, zmm0"
> +        enabled avx512icl && check_x86asm avx512icl_external "vpdpwssds
> zmm31{k1}{z}, zmm29, zmm28"
> +        enabled avx2      && check_x86asm avx2_external      "vextracti128
> xmm0, ymm0, 0"
> +        enabled xop       && check_x86asm xop_external       "vpmacsdd xmm0,
> xmm1, xmm2, xmm3"
> +        enabled fma4      && check_x86asm fma4_external      "vfmaddps ymm0,
> ymm1, ymm2, ymm3"
>          check_x86asm cpunop          "CPU amdnop"
>      fi
> 
> @@ -7471,6 +7475,7 @@ if enabled x86; then
>      echo "AVX enabled               ${avx-no}"
>      echo "AVX2 enabled              ${avx2-no}"
>      echo "AVX-512 enabled           ${avx512-no}"
> +    echo "AVX-512ICL enabled        ${avx512icl-no}"
>      echo "XOP enabled               ${xop-no}"
>      echo "FMA3 enabled              ${fma3-no}"
>      echo "FMA4 enabled              ${fma4-no}"
> diff --git a/libavutil/cpu.c b/libavutil/cpu.c index 1368502245..833c220192
> 100644
> --- a/libavutil/cpu.c
> +++ b/libavutil/cpu.c
> @@ -137,6 +137,7 @@ int av_parse_cpu_caps(unsigned *flags, const char *s)
>          { "cmov",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> AV_CPU_FLAG_CMOV     },    .unit = "flags" },
>          { "aesni",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> AV_CPU_FLAG_AESNI    },    .unit = "flags" },
>          { "avx512"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> AV_CPU_FLAG_AVX512   },    .unit = "flags" },
> +        { "avx512icl",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> AV_CPU_FLAG_AVX512ICL   }, .unit = "flags" },
>          { "slowgather", NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> AV_CPU_FLAG_SLOW_GATHER }, .unit = "flags" },
> 
>  #define CPU_FLAG_P2 AV_CPU_FLAG_CMOV | AV_CPU_FLAG_MMX diff --
> git a/libavutil/cpu.h b/libavutil/cpu.h index ce9bf14bf7..9711e574c5 100644
> --- a/libavutil/cpu.h
> +++ b/libavutil/cpu.h
> @@ -54,6 +54,7 @@
>  #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction
> Set 1
>  #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction
> Set 2
>  #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions:
> requires OS support even if YMM/ZMM registers aren't used
> +#define AV_CPU_FLAG_AVX512ICL  0x200000 ///< F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ
>  #define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow
> gathers.
> 
>  #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index
> 7b13fcae91..d6cd4fab9c 100644
> --- a/libavutil/x86/cpu.c
> +++ b/libavutil/x86/cpu.c
> @@ -150,9 +150,13 @@ int ff_get_cpu_flags_x86(void)
>              rval |= AV_CPU_FLAG_AVX2;
>  #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
>          if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
> -            if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) ==
> 0xd0030000)
> +            if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) ==
> + 0xd0030000) {
>                  rval |= AV_CPU_FLAG_AVX512;
> -
> +#if HAVE_AVX512ICL
> +                if ((ebx & 0xd0200000) == 0xd0200000 && (ecx & 0x5f42) == 0x5f42)
> +                    rval |= AV_CPU_FLAG_AVX512ICL;
> +#endif /* HAVE_AVX512ICL */
> +            }
>          }
>  #endif /* HAVE_AVX512 */
>  #endif /* HAVE_AVX2 */
> diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h index
> 937c697fa0..40a1eef0ab 100644
> --- a/libavutil/x86/cpu.h
> +++ b/libavutil/x86/cpu.h
> @@ -80,6 +80,7 @@
>  #define EXTERNAL_AVX2_SLOW(flags)   CPUEXT_SUFFIX_SLOW2(flags,
> _EXTERNAL, AVX2, AVX)
>  #define EXTERNAL_AESNI(flags)       CPUEXT_SUFFIX(flags, _EXTERNAL,
> AESNI)
>  #define EXTERNAL_AVX512(flags)      CPUEXT_SUFFIX(flags, _EXTERNAL,
> AVX512)
> +#define EXTERNAL_AVX512ICL(flags)   CPUEXT_SUFFIX(flags, _EXTERNAL,
> AVX512ICL)
> 
>  #define INLINE_AMD3DNOW(flags)      CPUEXT_SUFFIX(flags, _INLINE,
> AMD3DNOW)
>  #define INLINE_AMD3DNOWEXT(flags)   CPUEXT_SUFFIX(flags, _INLINE,
> AMD3DNOWEXT)
> diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index
> 01c35e3a4b..251ee797de 100644
> --- a/libavutil/x86/x86inc.asm
> +++ b/libavutil/x86/x86inc.asm
> @@ -817,32 +817,33 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge,
> jng, jnge, ja, jae,
> 
>  ; cpuflags
> 
> -%assign cpuflags_mmx      (1<<0)
> -%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
> -%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
> -%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
> -%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
> -%assign cpuflags_sse2     (1<<5) | cpuflags_sse
> -%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
> -%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
> -%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
> -%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
> -%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
> -%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
> -%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
> -%assign cpuflags_avx      (1<<13)| cpuflags_sse42
> -%assign cpuflags_xop      (1<<14)| cpuflags_avx
> -%assign cpuflags_fma4     (1<<15)| cpuflags_avx
> -%assign cpuflags_fma3     (1<<16)| cpuflags_avx
> -%assign cpuflags_bmi1     (1<<17)| cpuflags_avx|cpuflags_lzcnt
> -%assign cpuflags_bmi2     (1<<18)| cpuflags_bmi1
> -%assign cpuflags_avx2     (1<<19)| cpuflags_fma3|cpuflags_bmi2
> -%assign cpuflags_avx512   (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
> -
> -%assign cpuflags_cache32  (1<<21)
> -%assign cpuflags_cache64  (1<<22)
> -%assign cpuflags_aligned  (1<<23) ; not a cpu feature, but a function variant
> -%assign cpuflags_atom     (1<<24)
> +%assign cpuflags_mmx       (1<<0)
> +%assign cpuflags_mmx2      (1<<1) | cpuflags_mmx
> +%assign cpuflags_3dnow     (1<<2) | cpuflags_mmx
> +%assign cpuflags_3dnowext  (1<<3) | cpuflags_3dnow
> +%assign cpuflags_sse       (1<<4) | cpuflags_mmx2
> +%assign cpuflags_sse2      (1<<5) | cpuflags_sse
> +%assign cpuflags_sse2slow  (1<<6) | cpuflags_sse2
> +%assign cpuflags_lzcnt     (1<<7) | cpuflags_sse2
> +%assign cpuflags_sse3      (1<<8) | cpuflags_sse2
> +%assign cpuflags_ssse3     (1<<9) | cpuflags_sse3
> +%assign cpuflags_sse4      (1<<10)| cpuflags_ssse3
> +%assign cpuflags_sse42     (1<<11)| cpuflags_sse4
> +%assign cpuflags_aesni     (1<<12)| cpuflags_sse42
> +%assign cpuflags_avx       (1<<13)| cpuflags_sse42
> +%assign cpuflags_xop       (1<<14)| cpuflags_avx
> +%assign cpuflags_fma4      (1<<15)| cpuflags_avx
> +%assign cpuflags_fma3      (1<<16)| cpuflags_avx
> +%assign cpuflags_bmi1      (1<<17)| cpuflags_avx|cpuflags_lzcnt
> +%assign cpuflags_bmi2      (1<<18)| cpuflags_bmi1
> +%assign cpuflags_avx2      (1<<19)| cpuflags_fma3|cpuflags_bmi2
> +%assign cpuflags_avx512    (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
> +%assign cpuflags_avx512icl (1<<25)| cpuflags_avx512
> +
> +%assign cpuflags_cache32   (1<<21)
> +%assign cpuflags_cache64   (1<<22)
> +%assign cpuflags_aligned   (1<<23) ; not a cpu feature, but a function variant
> +%assign cpuflags_atom      (1<<24)
> 
>  ; Returns a boolean value expressing whether or not the specified cpuflag is
> enabled.
>  %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) -
> 1) >> 31) & 1)
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index
> f74125e810..e77b4ec20f 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -220,23 +220,24 @@ static const struct {
>      { "MMI",      "mmi",      AV_CPU_FLAG_MMI },
>      { "MSA",      "msa",      AV_CPU_FLAG_MSA },
>  #elif ARCH_X86
> -    { "MMX",      "mmx",      AV_CPU_FLAG_MMX|AV_CPU_FLAG_CMOV },
> -    { "MMXEXT",   "mmxext",   AV_CPU_FLAG_MMXEXT },
> -    { "3DNOW",    "3dnow",    AV_CPU_FLAG_3DNOW },
> -    { "3DNOWEXT", "3dnowext", AV_CPU_FLAG_3DNOWEXT },
> -    { "SSE",      "sse",      AV_CPU_FLAG_SSE },
> -    { "SSE2",     "sse2",     AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW },
> -    { "SSE3",     "sse3",     AV_CPU_FLAG_SSE3|AV_CPU_FLAG_SSE3SLOW },
> -    { "SSSE3",    "ssse3",    AV_CPU_FLAG_SSSE3|AV_CPU_FLAG_ATOM },
> -    { "SSE4.1",   "sse4",     AV_CPU_FLAG_SSE4 },
> -    { "SSE4.2",   "sse42",    AV_CPU_FLAG_SSE42 },
> -    { "AES-NI",   "aesni",    AV_CPU_FLAG_AESNI },
> -    { "AVX",      "avx",      AV_CPU_FLAG_AVX },
> -    { "XOP",      "xop",      AV_CPU_FLAG_XOP },
> -    { "FMA3",     "fma3",     AV_CPU_FLAG_FMA3 },
> -    { "FMA4",     "fma4",     AV_CPU_FLAG_FMA4 },
> -    { "AVX2",     "avx2",     AV_CPU_FLAG_AVX2 },
> -    { "AVX-512",  "avx512",   AV_CPU_FLAG_AVX512 },
> +    { "MMX",        "mmx",       AV_CPU_FLAG_MMX|AV_CPU_FLAG_CMOV },
> +    { "MMXEXT",     "mmxext",    AV_CPU_FLAG_MMXEXT },
> +    { "3DNOW",      "3dnow",     AV_CPU_FLAG_3DNOW },
> +    { "3DNOWEXT",   "3dnowext",  AV_CPU_FLAG_3DNOWEXT },
> +    { "SSE",        "sse",       AV_CPU_FLAG_SSE },
> +    { "SSE2",       "sse2",      AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW },
> +    { "SSE3",       "sse3",      AV_CPU_FLAG_SSE3|AV_CPU_FLAG_SSE3SLOW },
> +    { "SSSE3",      "ssse3",     AV_CPU_FLAG_SSSE3|AV_CPU_FLAG_ATOM },
> +    { "SSE4.1",     "sse4",      AV_CPU_FLAG_SSE4 },
> +    { "SSE4.2",     "sse42",     AV_CPU_FLAG_SSE42 },
> +    { "AES-NI",     "aesni",     AV_CPU_FLAG_AESNI },
> +    { "AVX",        "avx",       AV_CPU_FLAG_AVX },
> +    { "XOP",        "xop",       AV_CPU_FLAG_XOP },
> +    { "FMA3",       "fma3",      AV_CPU_FLAG_FMA3 },
> +    { "FMA4",       "fma4",      AV_CPU_FLAG_FMA4 },
> +    { "AVX2",       "avx2",      AV_CPU_FLAG_AVX2 },
> +    { "AVX-512",    "avx512",    AV_CPU_FLAG_AVX512 },
> +    { "AVX-512ICL", "avx512icl", AV_CPU_FLAG_AVX512ICL },
>  #elif ARCH_LOONGARCH
>      { "LSX",      "lsx",      AV_CPU_FLAG_LSX },
>      { "LASX",     "lasx",     AV_CPU_FLAG_LASX },
> --
> 2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag
  2022-03-02  5:33 ` [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag Wu, Jianhua
@ 2022-03-09  7:38   ` Wu, Jianhua
  0 siblings, 0 replies; 11+ messages in thread
From: Wu, Jianhua @ 2022-03-09  7:38 UTC (permalink / raw)
  To: ffmpeg-devel

Ping.
> From: Wu, Jianhua
> Sent: Wednesday, March 2, 2022 1:34 PM
> To: ffmpeg-devel@ffmpeg.org
> Subject: RE: [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag
> 
> Ping.
> > From: Wu, Jianhua <jianhua.wu@intel.com>
> > Sent: Wednesday, February 23, 2022 4:58 PM
> > To: ffmpeg-devel@ffmpeg.org
> > Cc: Wu, Jianhua <jianhua.wu@intel.com>
> > Subject: [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag
> >
> > From: Wu Jianhua <jianhua.wu@intel.com>
> >
> > Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> > ---
> >  configure                 | 13 +++++++---
> >  libavutil/cpu.c           |  1 +
> >  libavutil/cpu.h           |  1 +
> >  libavutil/x86/cpu.c       |  8 ++++--
> >  libavutil/x86/cpu.h       |  1 +
> >  libavutil/x86/x86inc.asm  | 53
> > ++++++++++++++++++++-------------------
> >  tests/checkasm/checkasm.c | 35 +++++++++++++-------------
> >  7 files changed, 63 insertions(+), 49 deletions(-)
> >
> > diff --git a/configure b/configure
> > index 1535dc3c5b..d88c2ae979 100755
> > --- a/configure
> > +++ b/configure
> > @@ -444,6 +444,7 @@ Optimization options (experts only):
> >    --disable-fma4           disable FMA4 optimizations
> >    --disable-avx2           disable AVX2 optimizations
> >    --disable-avx512         disable AVX-512 optimizations
> > +  --disable-avx512icl      disable AVX-512ICL optimizations
> >    --disable-aesni          disable AESNI optimizations
> >    --disable-armv5te        disable armv5te optimizations
> >    --disable-armv6          disable armv6 optimizations
> > @@ -2098,6 +2099,7 @@ ARCH_EXT_LIST_X86_SIMD="
> >      avx
> >      avx2
> >      avx512
> > +    avx512icl
> >      fma3
> >      fma4
> >      mmx
> > @@ -2666,6 +2668,7 @@ fma3_deps="avx"
> >  fma4_deps="avx"
> >  avx2_deps="avx"
> >  avx512_deps="avx2"
> > +avx512icl_deps="avx512"
> >
> >  mmx_external_deps="x86asm"
> >  mmx_inline_deps="inline_asm x86"
> > @@ -6128,10 +6131,11 @@ EOF
> >              elf*) enabled debug && append X86ASMFLAGS $x86asm_debug ;;
> >          esac
> >
> > -        enabled avx512 && check_x86asm avx512_external "vmovdqa32
> > [eax]{k1}{z}, zmm0"
> > -        enabled avx2   && check_x86asm avx2_external   "vextracti128 xmm0,
> > ymm0, 0"
> > -        enabled xop    && check_x86asm xop_external    "vpmacsdd xmm0,
> > xmm1, xmm2, xmm3"
> > -        enabled fma4   && check_x86asm fma4_external   "vfmaddps ymm0,
> > ymm1, ymm2, ymm3"
> > +        enabled avx512    && check_x86asm avx512_external    "vmovdqa32
> > [eax]{k1}{z}, zmm0"
> > +        enabled avx512icl && check_x86asm avx512icl_external
> > + "vpdpwssds
> > zmm31{k1}{z}, zmm29, zmm28"
> > +        enabled avx2      && check_x86asm avx2_external      "vextracti128
> > xmm0, ymm0, 0"
> > +        enabled xop       && check_x86asm xop_external       "vpmacsdd xmm0,
> > xmm1, xmm2, xmm3"
> > +        enabled fma4      && check_x86asm fma4_external      "vfmaddps
> ymm0,
> > ymm1, ymm2, ymm3"
> >          check_x86asm cpunop          "CPU amdnop"
> >      fi
> >
> > @@ -7471,6 +7475,7 @@ if enabled x86; then
> >      echo "AVX enabled               ${avx-no}"
> >      echo "AVX2 enabled              ${avx2-no}"
> >      echo "AVX-512 enabled           ${avx512-no}"
> > +    echo "AVX-512ICL enabled        ${avx512icl-no}"
> >      echo "XOP enabled               ${xop-no}"
> >      echo "FMA3 enabled              ${fma3-no}"
> >      echo "FMA4 enabled              ${fma4-no}"
> > diff --git a/libavutil/cpu.c b/libavutil/cpu.c index
> > 1368502245..833c220192
> > 100644
> > --- a/libavutil/cpu.c
> > +++ b/libavutil/cpu.c
> > @@ -137,6 +137,7 @@ int av_parse_cpu_caps(unsigned *flags, const char
> *s)
> >          { "cmov",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> > AV_CPU_FLAG_CMOV     },    .unit = "flags" },
> >          { "aesni",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> > AV_CPU_FLAG_AESNI    },    .unit = "flags" },
> >          { "avx512"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> > AV_CPU_FLAG_AVX512   },    .unit = "flags" },
> > +        { "avx512icl",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> > AV_CPU_FLAG_AVX512ICL   }, .unit = "flags" },
> >          { "slowgather", NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> > AV_CPU_FLAG_SLOW_GATHER }, .unit = "flags" },
> >
> >  #define CPU_FLAG_P2 AV_CPU_FLAG_CMOV | AV_CPU_FLAG_MMX diff
> -- git
> > a/libavutil/cpu.h b/libavutil/cpu.h index ce9bf14bf7..9711e574c5
> > 100644
> > --- a/libavutil/cpu.h
> > +++ b/libavutil/cpu.h
> > @@ -54,6 +54,7 @@
> >  #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation
> Instruction
> > Set 1
> >  #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation
> Instruction
> > Set 2
> >  #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions:
> > requires OS support even if YMM/ZMM registers aren't used
> > +#define AV_CPU_FLAG_AVX512ICL  0x200000 ///< F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ
> >  #define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow
> gathers.
> >
> >  #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
> > diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index
> > 7b13fcae91..d6cd4fab9c 100644
> > --- a/libavutil/x86/cpu.c
> > +++ b/libavutil/x86/cpu.c
> > @@ -150,9 +150,13 @@ int ff_get_cpu_flags_x86(void)
> >              rval |= AV_CPU_FLAG_AVX2;  #if HAVE_AVX512 /* F, CD, BW,
> > DQ, VL */
> >          if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
> > -            if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) ==
> > 0xd0030000)
> > +            if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) ==
> > + 0xd0030000) {
> >                  rval |= AV_CPU_FLAG_AVX512;
> > -
> > +#if HAVE_AVX512ICL
> > +                if ((ebx & 0xd0200000) == 0xd0200000 && (ecx & 0x5f42) ==
> 0x5f42)
> > +                    rval |= AV_CPU_FLAG_AVX512ICL;
> > +#endif /* HAVE_AVX512ICL */
> > +            }
> >          }
> >  #endif /* HAVE_AVX512 */
> >  #endif /* HAVE_AVX2 */
> > diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h index
> > 937c697fa0..40a1eef0ab 100644
> > --- a/libavutil/x86/cpu.h
> > +++ b/libavutil/x86/cpu.h
> > @@ -80,6 +80,7 @@
> >  #define EXTERNAL_AVX2_SLOW(flags)   CPUEXT_SUFFIX_SLOW2(flags,
> > _EXTERNAL, AVX2, AVX)
> >  #define EXTERNAL_AESNI(flags)       CPUEXT_SUFFIX(flags, _EXTERNAL,
> > AESNI)
> >  #define EXTERNAL_AVX512(flags)      CPUEXT_SUFFIX(flags, _EXTERNAL,
> > AVX512)
> > +#define EXTERNAL_AVX512ICL(flags)   CPUEXT_SUFFIX(flags, _EXTERNAL,
> > AVX512ICL)
> >
> >  #define INLINE_AMD3DNOW(flags)      CPUEXT_SUFFIX(flags, _INLINE,
> > AMD3DNOW)
> >  #define INLINE_AMD3DNOWEXT(flags)   CPUEXT_SUFFIX(flags, _INLINE,
> > AMD3DNOWEXT)
> > diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index
> > 01c35e3a4b..251ee797de 100644
> > --- a/libavutil/x86/x86inc.asm
> > +++ b/libavutil/x86/x86inc.asm
> > @@ -817,32 +817,33 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl,
> > jnle, jg, jge, jng, jnge, ja, jae,
> >
> >  ; cpuflags
> >
> > -%assign cpuflags_mmx      (1<<0)
> > -%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
> > -%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
> > -%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
> > -%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
> > -%assign cpuflags_sse2     (1<<5) | cpuflags_sse
> > -%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
> > -%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
> > -%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
> > -%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
> > -%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
> > -%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
> > -%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
> > -%assign cpuflags_avx      (1<<13)| cpuflags_sse42
> > -%assign cpuflags_xop      (1<<14)| cpuflags_avx
> > -%assign cpuflags_fma4     (1<<15)| cpuflags_avx
> > -%assign cpuflags_fma3     (1<<16)| cpuflags_avx
> > -%assign cpuflags_bmi1     (1<<17)| cpuflags_avx|cpuflags_lzcnt
> > -%assign cpuflags_bmi2     (1<<18)| cpuflags_bmi1
> > -%assign cpuflags_avx2     (1<<19)| cpuflags_fma3|cpuflags_bmi2
> > -%assign cpuflags_avx512   (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
> > -
> > -%assign cpuflags_cache32  (1<<21)
> > -%assign cpuflags_cache64  (1<<22)
> > -%assign cpuflags_aligned  (1<<23) ; not a cpu feature, but a function
> variant
> > -%assign cpuflags_atom     (1<<24)
> > +%assign cpuflags_mmx       (1<<0)
> > +%assign cpuflags_mmx2      (1<<1) | cpuflags_mmx
> > +%assign cpuflags_3dnow     (1<<2) | cpuflags_mmx
> > +%assign cpuflags_3dnowext  (1<<3) | cpuflags_3dnow
> > +%assign cpuflags_sse       (1<<4) | cpuflags_mmx2
> > +%assign cpuflags_sse2      (1<<5) | cpuflags_sse
> > +%assign cpuflags_sse2slow  (1<<6) | cpuflags_sse2
> > +%assign cpuflags_lzcnt     (1<<7) | cpuflags_sse2
> > +%assign cpuflags_sse3      (1<<8) | cpuflags_sse2
> > +%assign cpuflags_ssse3     (1<<9) | cpuflags_sse3
> > +%assign cpuflags_sse4      (1<<10)| cpuflags_ssse3
> > +%assign cpuflags_sse42     (1<<11)| cpuflags_sse4
> > +%assign cpuflags_aesni     (1<<12)| cpuflags_sse42
> > +%assign cpuflags_avx       (1<<13)| cpuflags_sse42
> > +%assign cpuflags_xop       (1<<14)| cpuflags_avx
> > +%assign cpuflags_fma4      (1<<15)| cpuflags_avx
> > +%assign cpuflags_fma3      (1<<16)| cpuflags_avx
> > +%assign cpuflags_bmi1      (1<<17)| cpuflags_avx|cpuflags_lzcnt
> > +%assign cpuflags_bmi2      (1<<18)| cpuflags_bmi1
> > +%assign cpuflags_avx2      (1<<19)| cpuflags_fma3|cpuflags_bmi2
> > +%assign cpuflags_avx512    (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
> > +%assign cpuflags_avx512icl (1<<25)| cpuflags_avx512
> > +
> > +%assign cpuflags_cache32   (1<<21)
> > +%assign cpuflags_cache64   (1<<22)
> > +%assign cpuflags_aligned   (1<<23) ; not a cpu feature, but a function
> variant
> > +%assign cpuflags_atom      (1<<24)
> >
> >  ; Returns a boolean value expressing whether or not the specified
> > cpuflag is enabled.
> >  %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) -
> > 1) >> 31) & 1)
> > diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> > index f74125e810..e77b4ec20f 100644
> > --- a/tests/checkasm/checkasm.c
> > +++ b/tests/checkasm/checkasm.c
> > @@ -220,23 +220,24 @@ static const struct {
> >      { "MMI",      "mmi",      AV_CPU_FLAG_MMI },
> >      { "MSA",      "msa",      AV_CPU_FLAG_MSA },
> >  #elif ARCH_X86
> > -    { "MMX",      "mmx",      AV_CPU_FLAG_MMX|AV_CPU_FLAG_CMOV },
> > -    { "MMXEXT",   "mmxext",   AV_CPU_FLAG_MMXEXT },
> > -    { "3DNOW",    "3dnow",    AV_CPU_FLAG_3DNOW },
> > -    { "3DNOWEXT", "3dnowext", AV_CPU_FLAG_3DNOWEXT },
> > -    { "SSE",      "sse",      AV_CPU_FLAG_SSE },
> > -    { "SSE2",     "sse2",     AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW },
> > -    { "SSE3",     "sse3",     AV_CPU_FLAG_SSE3|AV_CPU_FLAG_SSE3SLOW },
> > -    { "SSSE3",    "ssse3",    AV_CPU_FLAG_SSSE3|AV_CPU_FLAG_ATOM },
> > -    { "SSE4.1",   "sse4",     AV_CPU_FLAG_SSE4 },
> > -    { "SSE4.2",   "sse42",    AV_CPU_FLAG_SSE42 },
> > -    { "AES-NI",   "aesni",    AV_CPU_FLAG_AESNI },
> > -    { "AVX",      "avx",      AV_CPU_FLAG_AVX },
> > -    { "XOP",      "xop",      AV_CPU_FLAG_XOP },
> > -    { "FMA3",     "fma3",     AV_CPU_FLAG_FMA3 },
> > -    { "FMA4",     "fma4",     AV_CPU_FLAG_FMA4 },
> > -    { "AVX2",     "avx2",     AV_CPU_FLAG_AVX2 },
> > -    { "AVX-512",  "avx512",   AV_CPU_FLAG_AVX512 },
> > +    { "MMX",        "mmx",       AV_CPU_FLAG_MMX|AV_CPU_FLAG_CMOV },
> > +    { "MMXEXT",     "mmxext",    AV_CPU_FLAG_MMXEXT },
> > +    { "3DNOW",      "3dnow",     AV_CPU_FLAG_3DNOW },
> > +    { "3DNOWEXT",   "3dnowext",  AV_CPU_FLAG_3DNOWEXT },
> > +    { "SSE",        "sse",       AV_CPU_FLAG_SSE },
> > +    { "SSE2",       "sse2",
> AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW },
> > +    { "SSE3",       "sse3",
> AV_CPU_FLAG_SSE3|AV_CPU_FLAG_SSE3SLOW },
> > +    { "SSSE3",      "ssse3",     AV_CPU_FLAG_SSSE3|AV_CPU_FLAG_ATOM },
> > +    { "SSE4.1",     "sse4",      AV_CPU_FLAG_SSE4 },
> > +    { "SSE4.2",     "sse42",     AV_CPU_FLAG_SSE42 },
> > +    { "AES-NI",     "aesni",     AV_CPU_FLAG_AESNI },
> > +    { "AVX",        "avx",       AV_CPU_FLAG_AVX },
> > +    { "XOP",        "xop",       AV_CPU_FLAG_XOP },
> > +    { "FMA3",       "fma3",      AV_CPU_FLAG_FMA3 },
> > +    { "FMA4",       "fma4",      AV_CPU_FLAG_FMA4 },
> > +    { "AVX2",       "avx2",      AV_CPU_FLAG_AVX2 },
> > +    { "AVX-512",    "avx512",    AV_CPU_FLAG_AVX512 },
> > +    { "AVX-512ICL", "avx512icl", AV_CPU_FLAG_AVX512ICL },
> >  #elif ARCH_LOONGARCH
> >      { "LSX",      "lsx",      AV_CPU_FLAG_LSX },
> >      { "LASX",     "lasx",     AV_CPU_FLAG_LASX },
> > --
> > 2.17.1

Hi there,

These patches were sent two weeks ago but have received no response so far. Could the
maintainers of CPU flags and native HEVC decoding help review this patchset?

Thanks,
Jianhua

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag
  2022-02-23  8:57 [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag jianhua.wu-at-intel.com
                   ` (5 preceding siblings ...)
  2022-03-02  5:33 ` [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag Wu, Jianhua
@ 2022-03-10 14:34 ` Henrik Gramner
  6 siblings, 0 replies; 11+ messages in thread
From: Henrik Gramner @ 2022-03-10 14:34 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, Feb 23, 2022 at 9:57 AM <jianhua.wu-at-intel.com@ffmpeg.org> wrote:
>
> From: Wu Jianhua <jianhua.wu@intel.com>
>
> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> ---
>  configure                 | 13 +++++++---
>  libavutil/cpu.c           |  1 +
>  libavutil/cpu.h           |  1 +
>  libavutil/x86/cpu.c       |  8 ++++--
>  libavutil/x86/cpu.h       |  1 +
>  libavutil/x86/x86inc.asm  | 53 ++++++++++++++++++++-------------------
>  tests/checkasm/checkasm.c | 35 +++++++++++++-------------
>  7 files changed, 63 insertions(+), 49 deletions(-)

This patch LGTM (didn't look at the actual asm code yet though).
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/6] avcodec/x86/hevc_mc: add qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl
  2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 2/6] avcodec/x86/hevc_mc: add qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl jianhua.wu-at-intel.com
@ 2022-03-10 15:22   ` Henrik Gramner
  2022-03-11  7:55     ` Wu, Jianhua
  0 siblings, 1 reply; 11+ messages in thread
From: Henrik Gramner @ 2022-03-10 15:22 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, Feb 23, 2022 at 9:58 AM <jianhua.wu-at-intel.com@ffmpeg.org> wrote:
> +%macro HEVC_PUT_HEVC_QPEL_AVX512ICL 2
> [...]
> +    vpmovdw             xm6, m6
> +    movu             [dstq], xm6

vpmovdw can take a memory operand as dst directly:
vpmovdw          [dstq], m6

(the same applies to the hv function)

> +%macro HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 2
> +cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 8, dst, src, srcstride, height, mx, my, tmp

This function uses 27(?) vector registers but only specifies 8, so it
will break on Windows unless corrected.

> +        if (EXTERNAL_AVX512ICL(cpu_flags)) {
> +            c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
> +            c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
> +        }

Needs an ARCH_X86_64 guard as the code is 64-bit only.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/6] avcodec/x86/hevc_mc: add qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl
  2022-03-10 15:22   ` Henrik Gramner
@ 2022-03-11  7:55     ` Wu, Jianhua
  0 siblings, 0 replies; 11+ messages in thread
From: Wu, Jianhua @ 2022-03-11  7:55 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Henrik Gramner:
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Henrik Gramner
> Sent: Thursday, March 10, 2022 11:22 PM
> To: FFmpeg development discussions and patches <ffmpeg-
> devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH 2/6] avcodec/x86/hevc_mc: add
> qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl
> 
> On Wed, Feb 23, 2022 at 9:58 AM <jianhua.wu-at-intel.com@ffmpeg.org>
> wrote:
> > +%macro HEVC_PUT_HEVC_QPEL_AVX512ICL 2
> > [...]
> > +    vpmovdw             xm6, m6
> > +    movu             [dstq], xm6
> 
> vpmovdw can take a memory operand as dst directly:
> vpmovdw          [dstq], m6
> 
> (the same applies to the hv function)
> 
> > +%macro HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 2
> > +cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 8, dst, src, srcstride, height, mx, my, tmp
> 
> This function uses 27(?) vector registers but only specifies 8, so it will break
> on Windows unless corrected.
> 
> > +        if (EXTERNAL_AVX512ICL(cpu_flags)) {
> > +            c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
> > +            c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
> > +        }
> 
> Needs an ARCH_X86_64 guard as the code is 64-bit only.
> 

Thanks for the careful review. I have posted version 2 here:
http://ffmpeg.org/pipermail/ffmpeg-devel/2022-March/293872.html

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2022-03-11  7:55 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-23  8:57 [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag jianhua.wu-at-intel.com
2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 2/6] avcodec/x86/hevc_mc: add qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl jianhua.wu-at-intel.com
2022-03-10 15:22   ` Henrik Gramner
2022-03-11  7:55     ` Wu, Jianhua
2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 3/6] avcodec/x86/hevc_mc: add qpel_h16_8_avx512icl jianhua.wu-at-intel.com
2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 4/6] avcodec/x86/hevc_mc: add qpel_h4_8_avx512icl jianhua.wu-at-intel.com
2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 5/6] avcodec/x86/hevc_mc: add qpel_h32_8_avx512icl jianhua.wu-at-intel.com
2022-02-23  8:57 ` [FFmpeg-devel] [PATCH 6/6] avcodec/x86/hevc_mc: add qpel_h64_8_avx512icl jianhua.wu-at-intel.com
2022-03-02  5:33 ` [FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag Wu, Jianhua
2022-03-09  7:38   ` Wu, Jianhua
2022-03-10 14:34 ` Henrik Gramner

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git