Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] vp9: Add AVX-512ICL asm for 8bpc subpel mc (PR #20358)
From: gramner via ffmpeg-devel @ 2025-08-28  1:18 UTC
  To: ffmpeg-devel; +Cc: gramner

PR #20358 opened by gramner
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20358
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20358.patch

Checkasm numbers on Zen 5 (Strix Halo), lower is better; each new AVX-512ICL function is listed against the fastest previously available SIMD version for that width (SSSE3 up to width 16, AVX2 for widths 32 and 64):
```
vp9_put_8tap_smooth_4h_8bpp_ssse3:         18.0
vp9_put_8tap_smooth_4h_8bpp_avx512icl:      7.9

vp9_put_8tap_smooth_4v_8bpp_ssse3:         18.0
vp9_put_8tap_smooth_4v_8bpp_avx512icl:     10.4

vp9_put_8tap_smooth_4hv_8bpp_ssse3:        67.8
vp9_put_8tap_smooth_4hv_8bpp_avx512icl:    14.7

vp9_put_8tap_smooth_8h_8bpp_ssse3:         28.0
vp9_put_8tap_smooth_8h_8bpp_avx512icl:     11.1

vp9_put_8tap_smooth_8v_8bpp_ssse3:         28.9
vp9_put_8tap_smooth_8v_8bpp_avx512icl:     15.1

vp9_put_8tap_smooth_8hv_8bpp_ssse3:        79.2
vp9_put_8tap_smooth_8hv_8bpp_avx512icl:    25.8

vp9_put_8tap_smooth_16h_8bpp_ssse3:        78.7
vp9_put_8tap_smooth_16h_8bpp_avx512icl:    27.3

vp9_put_8tap_smooth_16v_8bpp_ssse3:        79.1
vp9_put_8tap_smooth_16v_8bpp_avx512icl:    34.0

vp9_put_8tap_smooth_16hv_8bpp_ssse3:      199.8
vp9_put_8tap_smooth_16hv_8bpp_avx512icl:   70.2

vp9_put_8tap_smooth_32h_8bpp_avx2:        151.5
vp9_put_8tap_smooth_32h_8bpp_avx512icl:    81.6

vp9_put_8tap_smooth_32v_8bpp_avx2:        148.0
vp9_put_8tap_smooth_32v_8bpp_avx512icl:   101.0

vp9_put_8tap_smooth_32hv_8bpp_avx2:       337.7
vp9_put_8tap_smooth_32hv_8bpp_avx512icl:  247.1

vp9_put_8tap_smooth_64h_8bpp_avx2:        600.1
vp9_put_8tap_smooth_64h_8bpp_avx512icl:   259.9

vp9_put_8tap_smooth_64v_8bpp_avx2:        590.6
vp9_put_8tap_smooth_64v_8bpp_avx512icl:   252.4

vp9_put_8tap_smooth_64hv_8bpp_avx2:      1343.4
vp9_put_8tap_smooth_64hv_8bpp_avx512icl:  938.1
```
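
For context, the h/v/hv suffixes denote the horizontal, vertical, and combined subpel cases, and numbers like the above are typically gathered with FFmpeg's checkasm benchmark mode (e.g. `tests/checkasm/checkasm --test=vp9dsp --bench`). Below is a minimal scalar sketch of what the 8bpc horizontal "put" kernels compute; the rounding matches the pd_64 bias and 7-bit shift used in the asm, but the function and its names are illustrative, not part of the patch:

```
#include <stddef.h>
#include <stdint.h>

static uint8_t clip_pixel(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Scalar reference: each output pixel is an 8-tap FIR over
 * src[x-3..x+4] using the signed coefficients selected by the mx
 * subpel position, rounded as (sum + 64) >> 7 and clamped to 8 bits. */
static void put_8tap_h_ref(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int w, int h, const int8_t filter[8])
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += filter[k] * src[x + k - 3];
            dst[x] = clip_pixel((sum + 64) >> 7);
        }
        src += src_stride;
        dst += dst_stride;
    }
}
```

The v kernels apply the same filter along columns, and the hv kernels filter horizontally into a temporary and then vertically over it. The avg variants compute the same filtered result and round-average it with the pixel already in dst, i.e. dst[x] = (dst[x] + res + 1) >> 1, which is what the pavgb instructions in the `%ifidn %1, avg` blocks do.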



From 915710f2e618f0fcc7a9daacfbd2fcdb3797cf2f Mon Sep 17 00:00:00 2001
From: Henrik Gramner <gramner@twoorioles.com>
Date: Wed, 27 Aug 2025 22:51:08 +0200
Subject: [PATCH] vp9: Add AVX-512ICL asm for 8bpc subpel mc

---
 libavcodec/x86/vp9dsp_init.c |  12 +
 libavcodec/x86/vp9dsp_init.h |  31 ++
 libavcodec/x86/vp9mc.asm     | 866 +++++++++++++++++++++++++++++++++++
 tests/checkasm/vp9dsp.c      |   6 +-
 4 files changed, 912 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 4373fa3f04..15862f43bf 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -50,6 +50,13 @@ decl_mc_funcs(8, ssse3, int8_t, 32, 8);
 #if ARCH_X86_64
 decl_mc_funcs(16, ssse3, int8_t, 32, 8);
 decl_mc_funcs(32, avx2, int8_t, 32, 8);
+#if HAVE_AVX512ICL_EXTERNAL
+decl_subpel_asm( 4, 8, avx512icl);
+decl_subpel_asm( 8, 8, avx512icl);
+decl_subpel_asm(16, 8, avx512icl);
+decl_subpel_asm(32, 8, avx512icl);
+decl_subpel_asm(64, 8, avx512icl);
+#endif
 #endif
 
 mc_rep_funcs(16,  8,  8,  sse2, int16_t,  8, 8)
@@ -418,6 +425,11 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
         dsp->itxfm_add[TX_32X32][ADST_DCT]  =
         dsp->itxfm_add[TX_32X32][DCT_ADST]  =
         dsp->itxfm_add[TX_32X32][DCT_DCT]   = ff_vp9_idct_idct_32x32_add_avx512icl;
+        init_subpel_asm(4,  4, 8, avx512icl);
+        init_subpel_asm(3,  8, 8, avx512icl);
+        init_subpel_asm(2, 16, 8, avx512icl);
+        init_subpel_asm(1, 32, 8, avx512icl);
+        init_subpel_asm(0, 64, 8, avx512icl);
     }
 #endif
 
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index fc1e0557fa..5690d16970 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -172,6 +172,37 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
     init_subpel3_8to64(idx, type, bpp, opt); \
     init_subpel2(4, idx,  4, type, bpp, opt)
 
+#define decl_subpel_asm_fn(type) \
+    void ff_vp9_put_8tap_##type(uint8_t *dst, ptrdiff_t dst_stride, \
+                                const uint8_t *src, ptrdiff_t src_stride, \
+                                int h, int mx, int my); \
+    void ff_vp9_avg_8tap_##type(uint8_t *dst, ptrdiff_t dst_stride, \
+                                const uint8_t *src, ptrdiff_t src_stride, \
+                                int h, int mx, int my)
+
+#define decl_subpel_asm_dir(type) \
+    decl_subpel_asm_fn(regular_##type); \
+    decl_subpel_asm_fn(smooth_##type); \
+    decl_subpel_asm_fn(sharp_##type)
+
+#define decl_subpel_asm(sz, bpp, opt) \
+    decl_subpel_asm_dir(sz##h_##bpp##_##opt); \
+    decl_subpel_asm_dir(sz##v_##bpp##_##opt); \
+    decl_subpel_asm_dir(sz##hv_##bpp##_##opt)
+
+#define init_subpel_asm_dir(idx1, idx2, idx3, type) \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][0][idx2][idx3] = ff_vp9_put_8tap_regular_##type; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][0][idx2][idx3] = ff_vp9_put_8tap_sharp_##type; \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][0][idx2][idx3] = ff_vp9_put_8tap_smooth_##type; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][1][idx2][idx3] = ff_vp9_avg_8tap_regular_##type; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][1][idx2][idx3] = ff_vp9_avg_8tap_sharp_##type; \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][idx2][idx3] = ff_vp9_avg_8tap_smooth_##type
+
+#define init_subpel_asm(idx, sz, bpp, opt) \
+    init_subpel_asm_dir(idx, 1, 0, sz##h_##bpp##_##opt); \
+    init_subpel_asm_dir(idx, 0, 1, sz##v_##bpp##_##opt); \
+    init_subpel_asm_dir(idx, 1, 1, sz##hv_##bpp##_##opt)
+
 #define init_ipred_func(type, enum, sz, bpp, opt) \
     dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
         cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index efc4cfbef1..286f23c999 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -2,6 +2,7 @@
 ;* VP9 motion compensation SIMD optimizations
 ;*
 ;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;* Copyright (c) 2025 Two Orioles, LLC
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -113,6 +114,89 @@ FILTER sse2
 ; int16_t ff_filters_16bpp[3][15][4][16]
 FILTER 16bpp
 
+%if HAVE_AVX512ICL_EXTERNAL && ARCH_X86_64
+ALIGN 64
+spel_h_perm16:  db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+                db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+                db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+spel_v_perm16:  db 32,  0, 33,  1, 34,  2, 35,  3, 36,  4, 37,  5, 38,  6, 39,  7
+                db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
+                db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
+                db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+spel_v_perm32:  db  0, 32,  1, 33,  2, 34,  3, 35,  4, 36,  5, 37,  6, 38,  7, 39
+                db  8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+                db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
+                db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+spel_hv_perm4:  db 16, 32, 48,  8, 18, 34, 50, 10, 20, 36, 52, 12, 22, 38, 54, 14
+                db 32, 48,  8, 24, 34, 50, 10, 26, 36, 52, 12, 28, 38, 54, 14, 30
+                db 48,  8, 24, 40, 50, 10, 26, 42, 52, 12, 28, 44, 54, 14, 30, 46
+                db  8, 24, 40, 56, 10, 26, 42, 58, 12, 28, 44, 60, 14, 30, 46, 62
+spel_hv_perm8:  db 16, 32, 48,  8, 17, 33, 49,  9, 18, 34, 50, 10, 19, 35, 51, 11
+                db 32, 48,  8, 24, 33, 49,  9, 25, 34, 50, 10, 26, 35, 51, 11, 27
+                db 48,  8, 24, 40, 49,  9, 25, 41, 50, 10, 26, 42, 51, 11, 27, 43
+                db  8, 24, 40, 56,  9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
+spel_hv_perm16: db 32,  8, 33,  9, 34, 10, 35, 11, 36, 12, 37, 13, 38, 14, 39, 15
+                db  8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+                db 48, 24, 49, 25, 50, 26, 51, 27, 52, 28, 53, 29, 54, 30, 55, 31
+                db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+spel_h_shufB:   db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+
+%define spel_h_shufA (spel_h_perm16+ 0)
+%define spel_h_shufC (spel_h_perm16+16)
+
+vp9_spel_filter_regular: db   0,   1,  -5, 126,   8,  -3,   1,   0
+                         db  -1,   3, -10, 122,  18,  -6,   2,   0
+                         db  -1,   4, -13, 118,  27,  -9,   3,  -1
+                         db  -1,   4, -16, 112,  37, -11,   4,  -1
+                         db  -1,   5, -18, 105,  48, -14,   4,  -1
+                         db  -1,   5, -19,  97,  58, -16,   5,  -1
+                         db  -1,   6, -19,  88,  68, -18,   5,  -1
+                         db  -1,   6, -19,  78,  78, -19,   6,  -1
+                         db  -1,   5, -18,  68,  88, -19,   6,  -1
+                         db  -1,   5, -16,  58,  97, -19,   5,  -1
+                         db  -1,   4, -14,  48, 105, -18,   5,  -1
+                         db  -1,   4, -11,  37, 112, -16,   4,  -1
+                         db  -1,   3,  -9,  27, 118, -13,   4,  -1
+                         db   0,   2,  -6,  18, 122, -10,   3,  -1
+                         db   0,   1,  -3,   8, 126,  -5,   1,   0
+vp9_spel_filter_sharp:   db  -1,   3,  -7, 127,   8,  -3,   1,   0
+                         db  -2,   5, -13, 125,  17,  -6,   3,  -1
+                         db  -3,   7, -17, 121,  27, -10,   5,  -2
+                         db  -4,   9, -20, 115,  37, -13,   6,  -2
+                         db  -4,  10, -23, 108,  48, -16,   8,  -3
+                         db  -4,  10, -24, 100,  59, -19,   9,  -3
+                         db  -4,  11, -24,  90,  70, -21,  10,  -4
+                         db  -4,  11, -23,  80,  80, -23,  11,  -4
+                         db  -4,  10, -21,  70,  90, -24,  11,  -4
+                         db  -3,   9, -19,  59, 100, -24,  10,  -4
+                         db  -3,   8, -16,  48, 108, -23,  10,  -4
+                         db  -2,   6, -13,  37, 115, -20,   9,  -4
+                         db  -2,   5, -10,  27, 121, -17,   7,  -3
+                         db  -1,   3,  -6,  17, 125, -13,   5,  -2
+                         db   0,   1,  -3,   8, 127,  -7,   3,  -1
+vp9_spel_filter_smooth:  db  -3,  -1,  32,  64,  38,   1,  -3,   0
+                         db  -2,  -2,  29,  63,  41,   2,  -3,   0
+                         db  -2,  -2,  26,  63,  43,   4,  -4,   0
+                         db  -2,  -3,  24,  62,  46,   5,  -4,   0
+                         db  -2,  -3,  21,  60,  49,   7,  -4,   0
+                         db  -1,  -4,  18,  59,  51,   9,  -4,   0
+                         db  -1,  -4,  16,  57,  53,  12,  -4,  -1
+                         db  -1,  -4,  14,  55,  55,  14,  -4,  -1
+                         db  -1,  -4,  12,  53,  57,  16,  -4,  -1
+                         db   0,  -4,   9,  51,  59,  18,  -4,  -1
+                         db   0,  -4,   7,  49,  60,  21,  -3,  -2
+                         db   0,  -4,   5,  46,  62,  24,  -3,  -2
+                         db   0,  -4,   4,  43,  63,  26,  -2,  -2
+                         db   0,  -3,   2,  41,  63,  29,  -2,  -2
+                         db   0,  -3,   1,  38,  64,  32,  -1,  -3
+
+pb_02461357:    db  0,  2,  4,  6,  1,  3,  5,  7
+pd_64:          dd 64
+pw_m33:         times 2 dw -33
+pb_4:           times 4 db 4
+%endif
+
 SECTION .text
 
 %macro filter_sse2_h_fn 1
@@ -678,3 +762,785 @@ fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16
 %undef d16
 %undef s32
 %undef d32
+
+%if HAVE_AVX512ICL_EXTERNAL && ARCH_X86_64
+%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
+%if %5
+    vpermb              m%2, m6, m%1
+    vpermb              m%3, m7, m%1
+    vpermb              m%4, m8, m%1
+%else
+%if %2 < %4 ; reuse a previous value if possible
+    pshufb              m%2, m%1, m6
+%endif
+    pshufb              m%3, m%1, m7
+    pshufb              m%4, m%1, m8
+%endif
+    mova                m%1, m5
+    vpdpbusd            m%1, m%2, m9
+    mova                m%2, m5
+    vpdpbusd            m%2, m%3, m9
+    vpdpbusd            m%1, m%3, m10
+    vpdpbusd            m%2, m%4, m10
+    packusdw            m%1, m%2
+    psrlw               m%1, 7
+%endmacro
+
+%macro SPEL_H_INIT 2 ; put/avg, w
+cglobal vp9_%1_8tap_smooth_%2h_8, 4, 7, 0
+    lea                  r6, [vp9_spel_filter_smooth-8]
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2h_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_sharp_%2h_8, 4, 7, 0
+    lea                  r6, [vp9_spel_filter_sharp-8]
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2h_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_regular_%2h_8, 4, 7, 0, dst, ds, src, ss, h, mx
+    lea                  r6, [vp9_spel_filter_regular-8]
+.main:
+    mov                 mxd, mxm
+    movifnidn            hd, hm
+    sub                srcq, 3
+    vpbroadcastd         m5, [pd_64]
+    vpbroadcastd         m9, [r6+mxq*8+0]
+    vpbroadcastd        m10, [r6+mxq*8+4]
+%endmacro
+
+%macro SPEL_V_INIT 2 ; put/avg, w
+cglobal vp9_%1_8tap_smooth_%2v_8, 4, 7, 0
+    lea                  r5, [vp9_spel_filter_smooth-8]
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2v_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_sharp_%2v_8, 4, 7, 0
+    lea                  r5, [vp9_spel_filter_sharp-8]
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2v_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_regular_%2v_8, 4, 7, 0, dst, ds, src, ss, h, mx, my
+    lea                  r5, [vp9_spel_filter_regular-8]
+.main:
+    mov                 myd, mym
+    movifnidn            hd, hm
+    lea                 myq, [r5+myq*8]
+    vpbroadcastd         m7, [pw_256]
+    vpbroadcastw         m8, [myq+0]
+    vpbroadcastw         m9, [myq+2]
+    lea                  r5, [ssq*3]
+    vpbroadcastw        m10, [myq+4]
+    sub                srcq, r5
+    vpbroadcastw        m11, [myq+6]
+%endmacro
+
+%macro SPEL_HV_INIT 2 ; put/avg, w
+cglobal vp9_%1_8tap_smooth_%2hv_8, 4, 8, 0
+    lea                  r6, [vp9_spel_filter_smooth-8]
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2hv_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_sharp_%2hv_8, 4, 8, 0
+    lea                  r6, [vp9_spel_filter_sharp-8]
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2hv_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_regular_%2hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my
+    lea                  r6, [vp9_spel_filter_regular-8]
+.main:
+%if %2 == 16
+    xor                r7d, r7d
+.main2:
+%endif
+    mov                 mxd, mxm
+    movifnidn            hd, hm
+    sub                srcq, 3
+    vpbroadcastd         m9, [r6+mxq*8+0]
+    vpbroadcastd        m10, [r6+mxq*8+4]
+    mov                 mxd, mym
+    vpbroadcastd         m5, [pd_64]
+    lea                 myq, [r6+mxq*8]
+    lea                  r5, [ssq*3]
+    sub                srcq, r5
+%endmacro
+
+%macro MC_AVX512 1 ; put/avg
+    SPEL_H_INIT          %1, 4
+    vbroadcasti32x4      m6, [spel_h_shufA]
+    lea                  r5, [ssq*3]
+    vbroadcasti32x4      m7, [spel_h_shufB]
+    lea                  r6, [dsq*3]
+    vbroadcasti32x4      m8, [spel_h_shufC]
+.h_w4_loop:
+    movu                xm0, [srcq+ssq*0]
+    vinserti32x4        ym0, [srcq+ssq*1], 1
+    vinserti32x4         m0, [srcq+ssq*2], 2
+    vinserti32x4         m0, [srcq+r5   ], 3
+    lea                srcq, [srcq+ssq*4]
+    pshufb               m1, m0, m6
+    pshufb               m0, m7
+    mova                 m2, m5
+    vpdpbusd             m2, m1, m9
+    vpdpbusd             m2, m0, m10
+    vpmovsdw            ym0, m2
+    psraw               ym0, 7
+    packuswb            ym0, ym0
+    vextracti32x4       xm1, ym0, 1
+%ifidn %1, avg
+    movd               xmm2, [dstq+dsq*0]
+    pinsrd             xmm2, [dstq+dsq*1], 1
+    movd               xmm3, [dstq+dsq*2]
+    pinsrd             xmm3, [dstq+r6   ], 1
+    pavgb               xm0, xmm2
+    pavgb               xm1, xmm3
+%endif
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    movd       [dstq+dsq*2], xm1
+    pextrd     [dstq+r6   ], xm1, 1
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .h_w4_loop
+    RET
+
+    SPEL_H_INIT          %1, 8
+    vbroadcasti32x4      m6, [spel_h_shufA]
+    lea                  r5, [ssq*3]
+    vbroadcasti32x4      m7, [spel_h_shufB]
+    lea                  r6, [dsq*3]
+    vbroadcasti32x4      m8, [spel_h_shufC]
+.h_w8_loop:
+    movu                xm0, [srcq+ssq*0]
+    vinserti32x4        ym0, [srcq+ssq*1], 1
+    vinserti32x4         m0, [srcq+ssq*2], 2
+    vinserti32x4         m0, [srcq+r5   ], 3
+    lea                srcq, [srcq+ssq*4]
+    PUT_8TAP_H            0, 1, 2, 3
+    vpmovuswb           ym0, m0
+    vextracti32x4       xm1, ym0, 1
+%ifidn %1, avg
+    movq               xmm2, [dstq+dsq*0]
+    movhps             xmm2, [dstq+dsq*1]
+    movq               xmm3, [dstq+dsq*2]
+    movhps             xmm3, [dstq+r6   ]
+    pavgb               xm0, xmm2
+    pavgb               xm1, xmm3
+%endif
+    movq       [dstq+dsq*0], xm0
+    movhps     [dstq+dsq*1], xm0
+    movq       [dstq+dsq*2], xm1
+    movhps     [dstq+r6   ], xm1
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .h_w8_loop
+    RET
+
+    SPEL_H_INIT          %1, 16
+    mova                 m6, [spel_h_perm16]
+    vpbroadcastd         m8, [pb_4]
+    paddb                m7, m8, m6
+    paddb                m8, m7
+.h_w16_loop:
+    movu                ym0, [srcq+ssq*0]
+    vinserti32x8         m0, [srcq+ssq*1], 1
+    lea                srcq, [srcq+ssq*2]
+    PUT_8TAP_H            0, 1, 2, 3, 1
+    vpmovuswb           ym0, m0
+%ifidn %1, avg
+    movu                xm1, [dstq+dsq*0]
+    vinserti32x4        ym1, [dstq+dsq*1], 1
+    pavgb               ym0, ym1
+%endif
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], ym0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w16_loop
+    RET
+
+    SPEL_H_INIT          %1, 32
+    vbroadcasti32x4      m6, [spel_h_shufA]
+    vbroadcasti32x4      m7, [spel_h_shufB]
+    vbroadcasti32x4      m8, [spel_h_shufC]
+.h_w32_loop:
+    movu                ym0, [srcq+ssq*0+8*0]
+    vinserti32x8         m0, [srcq+ssq*1+8*0], 1
+    movu                ym1, [srcq+ssq*0+8*1]
+    vinserti32x8         m1, [srcq+ssq*1+8*1], 1
+    lea                srcq, [srcq+ssq*2]
+    PUT_8TAP_H            0, 2, 3, 4
+    PUT_8TAP_H            1, 4, 3, 2
+    packuswb             m0, m1
+%ifidn %1, avg
+    movu                ym1, [dstq+dsq*0]
+    vinserti32x8         m1, [dstq+dsq*1], 1
+    pavgb                m0, m1
+%endif
+    mova          [dstq+dsq*0], ym0
+    vextracti32x8 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w32_loop
+    RET
+
+    SPEL_H_INIT          %1, 64
+    vbroadcasti32x4      m6, [spel_h_shufA]
+    vbroadcasti32x4      m7, [spel_h_shufB]
+    vbroadcasti32x4      m8, [spel_h_shufC]
+.h_w64_loop:
+    movu                 m0, [srcq+8*0]
+    movu                 m1, [srcq+8*1]
+    add                srcq, ssq
+    PUT_8TAP_H            0, 2, 3, 4
+    PUT_8TAP_H            1, 4, 3, 2
+    packuswb             m0, m1
+%ifidn %1, avg
+    pavgb                m0, [dstq]
+%endif
+    mova             [dstq], m0
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w64_loop
+    RET
+
+    SPEL_V_INIT          %1, 4
+    movd               xmm2, [srcq+ssq*0]
+    pinsrd             xmm2, [srcq+ssq*1], 1
+    pinsrd             xmm2, [srcq+ssq*2], 2
+    add                srcq, r5
+    pinsrd             xmm2, [srcq+ssq*0], 3  ; 0 1 2 3
+    movd               xmm3, [srcq+ssq*1]
+    vpbroadcastd       xmm1, [srcq+ssq*2]
+    add                srcq, r5
+    vpbroadcastd       xmm0, [srcq+ssq*0]
+    vpblendd           xmm3, xmm3, xmm1, 0x02 ; 4 5
+    vpblendd           xmm1, xmm1, xmm0, 0x02 ; 5 6
+    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
+    punpcklbw          xmm3, xmm1             ; 45 56
+    punpcklbw          xmm1, xmm2, xmm4       ; 01 12
+    punpckhbw          xmm2, xmm4             ; 23 34
+%if WIN64
+    movaps          [rsp+8], xmm6
+%endif
+.v_w4_loop:
+    vpbroadcastd       xmm4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pmaddubsw          xmm5, xmm1, xm8        ; a0 b0
+    mova               xmm1, xmm2
+    pmaddubsw          xmm6, xmm2, xm9        ; a1 b1
+    mova               xmm2, xmm3
+    pmaddubsw          xmm3, xm10             ; a2 b2
+    paddw              xmm5, xmm3
+    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
+    vpbroadcastd       xmm0, [srcq+ssq*0]
+    vpblendd           xmm4, xmm0, 0x02       ; 7 8
+    punpcklbw          xmm3, xmm4             ; 67 78
+    pmaddubsw          xmm4, xmm3, xm11       ; a3 b3
+    paddw              xmm6, xmm4
+    paddsw             xmm5, xmm6
+    pmulhrsw           xmm5, xm7
+    packuswb           xmm5, xmm5
+%ifidn %1, avg
+    movd               xmm4, [dstq+dsq*0]
+    pinsrd             xmm4, [dstq+dsq*1], 1
+    pavgb              xmm5, xmm4
+%endif
+    movd       [dstq+dsq*0], xmm5
+    pextrd     [dstq+dsq*1], xmm5, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+%if WIN64
+    movaps             xmm6, [rsp+8]
+%endif
+    RET
+
+    SPEL_V_INIT          %1, 8
+    movq               xmm1, [srcq+ssq*0]
+    vpbroadcastq       ymm0, [srcq+ssq*1]
+    vpbroadcastq       ymm2, [srcq+ssq*2]
+    add                srcq, r5
+    vpbroadcastq       ymm5, [srcq+ssq*0]
+    vpbroadcastq       ymm3, [srcq+ssq*1]
+    vpbroadcastq       ymm4, [srcq+ssq*2]
+    add                srcq, r5
+    vpblendd           ymm1, ymm0, 0x30
+    vpblendd           ymm0, ymm2, 0x30
+    punpcklbw          ymm1, ymm0       ; 01 12
+    vpbroadcastq       ymm0, [srcq+ssq*0]
+    vpblendd           ymm2, ymm5, 0x30
+    vpblendd           ymm5, ymm3, 0x30
+    punpcklbw          ymm2, ymm5       ; 23 34
+    vpblendd           ymm3, ymm4, 0x30
+    vpblendd           ymm4, ymm0, 0x30
+    punpcklbw          ymm3, ymm4       ; 45 56
+%if WIN64
+    movaps          [rsp+8], xmm6
+%endif
+.v_w8_loop:
+    vpbroadcastq       ymm4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pmaddubsw          ymm5, ymm1, ym8  ; a0 b0
+    mova               ymm1, ymm2
+    pmaddubsw          ymm6, ymm2, ym9  ; a1 b1
+    mova               ymm2, ymm3
+    pmaddubsw          ymm3, ym10       ; a2 b2
+    paddw              ymm5, ymm3
+    vpblendd           ymm3, ymm0, ymm4, 0x30
+    vpbroadcastq       ymm0, [srcq+ssq*0]
+    vpblendd           ymm4, ymm4, ymm0, 0x30
+    punpcklbw          ymm3, ymm4       ; 67 78
+    pmaddubsw          ymm4, ymm3, ym11 ; a3 b3
+    paddw              ymm6, ymm4
+    paddsw             ymm5, ymm6
+    pmulhrsw           ymm5, ym7
+    vextracti128       xmm4, ymm5, 1
+    packuswb           xmm5, xmm4
+%ifidn %1, avg
+    movq               xmm4, [dstq+dsq*0]
+    movhps             xmm4, [dstq+dsq*1]
+    pavgb              xmm5, xmm4
+%endif
+    movq       [dstq+dsq*0], xmm5
+    movhps     [dstq+dsq*1], xmm5
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w8_loop
+%if WIN64
+    movaps             xmm6, [rsp+8]
+%endif
+    vzeroupper
+    RET
+
+    SPEL_V_INIT          %1, 16
+    mova                m12, [spel_v_perm16]
+    vbroadcasti32x4      m1, [srcq+ssq*0]
+    vbroadcasti32x4     ym4, [srcq+ssq*1]
+    mov                 r6d, 0x0f
+    vbroadcasti32x4      m2, [srcq+ssq*2]
+    add                srcq, r5
+    vbroadcasti32x4     ym5, [srcq+ssq*0]
+    kmovb                k1, r6d
+    vbroadcasti32x4      m3, [srcq+ssq*1]
+    vbroadcasti32x4     ym6, [srcq+ssq*2]
+    add                srcq, r5
+    vbroadcasti32x4      m0, [srcq+ssq*0]
+    vshufpd          m1{k1}, m4, m2, 0xcc
+    vshufpd          m2{k1}, m5, m3, 0xcc
+    vshufpd          m3{k1}, m6, m0, 0xcc
+    vpermb               m1, m12, m1 ; 01 12
+    vpermb               m2, m12, m2 ; 23 34
+    vpermb               m3, m12, m3 ; 45 56
+.v_w16_loop:
+    pmaddubsw            m4, m1, m8  ; a0 b0
+    mova                 m1, m2
+    pmaddubsw            m5, m2, m9  ; a1 b1
+    mova                 m2, m3
+    pmaddubsw            m6, m3, m10 ; a2 b2
+    mova                 m3, m0
+    paddw                m4, m6
+    vbroadcasti32x4     ym6, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vbroadcasti32x4      m0, [srcq+ssq*0]
+    vshufpd          m3{k1}, m6, m0, 0xcc
+    vpermb               m3, m12, m3 ; 67 78
+    pmaddubsw            m6, m3, m11 ; a3 b3
+    paddw                m5, m6
+    paddsw               m4, m5
+    pmulhrsw             m4, m7
+    vextracti32x8       ym5, m4, 1
+    packuswb            ym4, ym5
+%ifidn %1, avg
+    mova                xm5, [dstq+dsq*0]
+    vinserti32x4        ym5, [dstq+dsq*1], 1
+    pavgb               ym4, ym5
+%endif
+    mova          [dstq+dsq*0], xm4
+    vextracti32x4 [dstq+dsq*1], ym4, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w16_loop
+    RET
+
+    SPEL_V_INIT          %1, 32
+    mova                m12, [spel_v_perm32]
+    pmovzxbq            m14, [pb_02461357]
+    vpshrdw             m13, m12, m12, 8
+    movu                ym0, [srcq+ssq*0]
+    vinserti32x8         m0, [srcq+ssq*1], 1
+    vpermb               m1, m12, m0 ; 01
+    vinserti32x8         m0, [srcq+ssq*2], 0
+    add                srcq, r5
+    vpermb               m2, m13, m0 ; 12
+    vinserti32x8         m0, [srcq+ssq*0], 1
+    vpermb               m3, m12, m0 ; 23
+    vinserti32x8         m0, [srcq+ssq*1], 0
+    vpermb               m4, m13, m0 ; 34
+    vinserti32x8         m0, [srcq+ssq*2], 1
+    add                srcq, r5
+    vpermb               m5, m12, m0 ; 45
+    vinserti32x8         m0, [srcq+ssq*0], 0
+    vpermb               m6, m13, m0 ; 56
+.v_w32_loop:
+    vinserti32x8         m0, [srcq+ssq*1], 1
+    lea                srcq, [srcq+ssq*2]
+    pmaddubsw           m15, m1, m8
+    mova                 m1, m3
+    pmaddubsw           m16, m2, m8
+    mova                 m2, m4
+    pmaddubsw           m17, m3, m9
+    mova                 m3, m5
+    pmaddubsw           m18, m4, m9
+    mova                 m4, m6
+    pmaddubsw           m19, m5, m10
+    vpermb               m5, m12, m0 ; 67
+    vinserti32x8         m0, [srcq+ssq*0], 0
+    pmaddubsw           m20, m6, m10
+    vpermb               m6, m13, m0 ; 78
+    paddw               m15, m19
+    pmaddubsw           m19, m5, m11
+    paddw               m16, m20
+    pmaddubsw           m20, m6, m11
+    paddw               m17, m19
+    paddw               m18, m20
+    paddsw              m15, m17
+    paddsw              m16, m18
+    pmulhrsw            m15, m7
+    pmulhrsw            m16, m7
+    packuswb            m15, m16
+    vpermq              m15, m14, m15
+%ifidn %1, avg
+    mova               ym16, [dstq+dsq*0]
+    vinserti32x8        m16, [dstq+dsq*1], 1
+    pavgb               m15, m16
+%endif
+    mova          [dstq+dsq*0], ym15
+    vextracti32x8 [dstq+dsq*1], m15, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w32_loop
+    vzeroupper
+    RET
+
+    SPEL_V_INIT          %1, 64
+    movu                 m2, [srcq+ssq*0]
+    movu                 m4, [srcq+ssq*1]
+    movu                 m6, [srcq+ssq*2]
+    add                srcq, r5
+    movu                m13, [srcq+ssq*0]
+    movu                m15, [srcq+ssq*1]
+    movu                m17, [srcq+ssq*2]
+    add                srcq, r5
+    movu                 m0, [srcq+ssq*0]
+    punpcklbw            m1, m2, m4   ; 01l
+    punpckhbw            m2, m4       ; 01h
+    punpcklbw            m3, m4, m6   ; 12l
+    punpckhbw            m4, m6       ; 12h
+    punpcklbw            m5, m6, m13  ; 23l
+    punpckhbw            m6, m13      ; 23h
+    punpcklbw           m12, m13, m15 ; 34l
+    punpckhbw           m13, m15      ; 34h
+    punpcklbw           m14, m15, m17 ; 45l
+    punpckhbw           m15, m17      ; 45h
+    punpcklbw           m16, m17, m0  ; 56l
+    punpckhbw           m17, m0       ; 56h
+%if WIN64
+    movaps          [rsp+8], xmm6
+%endif
+.v_w64_loop:
+    movu                m22, [srcq+ssq*1]
+    pmaddubsw            m1, m8       ; a0l
+    pmaddubsw           m18, m14, m10 ; a2l
+    lea                srcq, [srcq+ssq*2]
+    pmaddubsw            m2, m8       ; a0h
+    pmaddubsw           m19, m15, m10 ; a2h
+    paddw               m18, m1
+    mova                 m1, m5
+    paddw               m19, m2
+    mova                 m2, m6
+    pmaddubsw           m20, m5, m9   ; a1l
+    mova                 m5, m14
+    pmaddubsw           m21, m6, m9   ; a1h
+    mova                 m6, m15
+    punpcklbw           m14, m0, m22  ; 67l
+    punpckhbw           m15, m0, m22  ; 67h
+    pmaddubsw            m0, m14, m11 ; a3l
+    paddw               m20, m0
+    pmaddubsw            m0, m15, m11 ; a3h
+    paddw               m21, m0
+    movu                 m0, [srcq+ssq*0]
+    paddsw              m18, m20
+    paddsw              m19, m21
+    pmaddubsw            m3, m8       ; b0l
+    pmaddubsw           m20, m16, m10 ; b2l
+    pmaddubsw            m4, m8       ; b0h
+    pmaddubsw           m21, m17, m10 ; b2h
+    pmulhrsw            m18, m7
+    pmulhrsw            m19, m7
+    paddw               m20, m3
+    mova                 m3, m12
+    paddw               m21, m4
+    mova                 m4, m13
+    packuswb            m18, m19
+%ifidn %1, avg
+    pavgb               m18, [dstq+dsq*0]
+%endif
+    mova       [dstq+dsq*0], m18
+    pmaddubsw           m18, m12, m9  ; b1l
+    mova                m12, m16
+    punpcklbw           m16, m22, m0  ; 78l
+    pmaddubsw           m19, m13, m9  ; b1h
+    mova                m13, m17
+    punpckhbw           m17, m22, m0  ; 78h
+    pmaddubsw           m22, m16, m11 ; b3l
+    paddw               m18, m22
+    pmaddubsw           m22, m17, m11 ; b3h
+    paddw               m19, m22
+    paddsw              m18, m20
+    paddsw              m19, m21
+    pmulhrsw            m18, m7
+    pmulhrsw            m19, m7
+    packuswb            m18, m19
+%ifidn %1, avg
+    pavgb               m18, [dstq+dsq*1]
+%endif
+    mova       [dstq+dsq*1], m18
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w64_loop
+%if WIN64
+    movaps             xmm6, [rsp+8]
+%endif
+    vzeroupper
+    RET
+
+    SPEL_HV_INIT         %1, 4
+    vbroadcasti32x4     ym2, [srcq+ssq*0]
+    vinserti32x4         m2, [srcq+ssq*1], 2
+    vbroadcasti32x4      m6, [spel_h_shufA]
+    vinserti32x4         m2, [srcq+ssq*2], 3 ; _ 0 1 2
+    add                srcq, r5
+    movu                xm0, [srcq+ssq*0]
+    vinserti32x4        ym0, [srcq+ssq*1], 1
+    vbroadcasti32x4      m7, [spel_h_shufB]
+    vinserti32x4         m0, [srcq+ssq*2], 2
+    add                srcq, r5
+    vpbroadcastd        m11, [myq+0]
+    vinserti32x4         m0, [srcq+ssq*0], 3 ; 3 4 5 6
+    vpbroadcastd        m12, [myq+4]
+    lea                  r6, [dsq*3]
+    mova                 m8, [spel_hv_perm4]
+    pshufb               m4, m2, m6
+    mova                 m1, m5
+    vpdpbusd             m1, m4, m9
+    pshufb               m4, m0, m6
+    mova                 m3, m5
+    vpdpbusd             m3, m4, m9
+    pshufb               m2, m7
+    pshufb               m0, m7
+    vpdpbusd             m1, m2, m10
+    vpdpbusd             m3, m0, m10
+    psrad                m1, 7
+    psrad                m0, m3, 7
+    packuswb             m1, m0     ; _3   04   15   26
+    vpermb               m1, m8, m1 ; 0123 1234 2345 3456
+.hv_w4_loop:
+    movu                xm4, [srcq+ssq*1]
+    vinserti32x4        ym4, [srcq+ssq*2], 1
+    vinserti32x4         m4, [srcq+r5   ], 2
+    lea                srcq, [srcq+ssq*4]
+    vinserti32x4         m4, [srcq+ssq*0], 3 ; 7 8 9 a
+    mova                 m3, m5
+    pshufb               m2, m4, m6
+    vpdpbusd             m3, m2, m9
+    mova                 m2, m5
+    vpdpbusd             m2, m1, m11
+    pshufb               m4, m7
+    vpdpbusd             m3, m4, m10
+    psrad                m3, 7
+    packuswb             m1, m0, m3 ; 37   48   59   6a
+    mova                 m0, m3
+    vpermb               m1, m8, m1 ; 4567 5678 6789 789a
+    vpdpbusd             m2, m1, m12
+    psrad                m2, 7
+    vpmovdw             ym2, m2
+    packuswb            ym2, ym2
+    vextracti32x4       xm3, ym2, 1
+%ifidn %1, avg
+    movd               xmm4, [dstq+dsq*0]
+    pinsrd             xmm4, [dstq+dsq*1], 1
+    pavgb               xm2, xmm4
+    movd               xmm4, [dstq+dsq*2]
+    pinsrd             xmm4, [dstq+r6   ], 1
+    pavgb               xm3, xmm4
+%endif
+    movd       [dstq+dsq*0], xm2
+    pextrd     [dstq+dsq*1], xm2, 1
+    movd       [dstq+dsq*2], xm3
+    pextrd     [dstq+r6   ], xm3, 1
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .hv_w4_loop
+    RET
+
+    SPEL_HV_INIT         %1, 8
+    vbroadcasti32x4     ym2, [srcq+ssq*0]
+    vinserti32x4         m2, [srcq+ssq*1], 2
+    vbroadcasti32x4      m6, [spel_h_shufA]
+    vinserti32x4         m2, [srcq+ssq*2], 3 ; _ 0 1 2
+    add                srcq, r5
+    movu                xm0, [srcq+ssq*0]
+    vinserti32x4        ym0, [srcq+ssq*1], 1
+    vbroadcasti32x4      m7, [spel_h_shufB]
+    vinserti32x4         m0, [srcq+ssq*2], 2
+    add                srcq, r5
+    vpbroadcastd        m11, [myq+0]
+    vinserti32x4         m0, [srcq+ssq*0], 3 ; 3 4 5 6
+    vpbroadcastd        m12, [myq+4]
+    lea                  r6, [dsq*3]
+    vbroadcasti32x4      m8, [spel_h_shufC]
+    mova                m13, [spel_hv_perm8]
+    vpaddd              m14, m13, [pb_4] {1to16}
+    PUT_8TAP_H            2, 1, 3, 4
+    PUT_8TAP_H            0, 1, 3, 4
+    packuswb             m2, m0      ; _3   04   15   26
+    vpermb               m1, m13, m2 ; 0123 1234 2345 3456 (abcd)
+    vpermb               m2, m14, m2 ; 0123 1234 2345 3456 (efgh)
+.hv_w8_loop:
+    movu               xm18, [srcq+ssq*1]
+    vinserti128        ym18, [srcq+ssq*2], 1
+    vinserti32x4        m18, [srcq+r5   ], 2
+    lea                srcq, [srcq+ssq*4]
+    vinserti32x4        m18, [srcq+ssq*0], 3 ; 7 8 9 a
+    PUT_8TAP_H           18, 4, 16, 17
+    mova                m16, m5
+    vpdpbusd            m16, m1, m11
+    mova                m17, m5
+    vpdpbusd            m17, m2, m11
+    packuswb             m2, m0, m18 ; 37   48   59   6a
+    mova                 m0, m18
+    vpermb               m1, m13, m2 ; 4567 5678 6789 789a (abcd)
+    vpermb               m2, m14, m2 ; 4567 5678 6789 789a (efgh)
+    vpdpbusd            m16, m1, m12
+    vpdpbusd            m17, m2, m12
+    packusdw            m16, m17
+    psrlw               m16, 7
+    vpmovuswb          ym16, m16
+    vextracti128       xm17, ym16, 1
+%ifidn %1, avg
+    movq               xm18, [dstq+dsq*0]
+    movhps             xm18, [dstq+dsq*1]
+    pavgb              xm16, xm18
+    movq               xm18, [dstq+dsq*2]
+    movhps             xm18, [dstq+r6   ]
+    pavgb              xm17, xm18
+%endif
+    movq       [dstq+dsq*0], xm16
+    movhps     [dstq+dsq*1], xm16
+    movq       [dstq+dsq*2], xm17
+    movhps     [dstq+r6   ], xm17
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .hv_w8_loop
+    vzeroupper
+    RET
+
+cglobal vp9_%1_8tap_smooth_32hv_8, 4, 8, 0
+    lea                  r6, [vp9_spel_filter_smooth-8]
+    mov                 r7d, 256*1
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+cglobal vp9_%1_8tap_sharp_32hv_8, 4, 8, 0
+    lea                  r6, [vp9_spel_filter_sharp-8]
+    mov                 r7d, 256*1
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+cglobal vp9_%1_8tap_regular_32hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my
+    lea                  r6, [vp9_spel_filter_regular-8]
+    mov                 r7d, 256*1
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+cglobal vp9_%1_8tap_smooth_64hv_8, 4, 8, 0
+    lea                  r6, [vp9_spel_filter_smooth-8]
+    mov                 r7d, 256*3
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+cglobal vp9_%1_8tap_sharp_64hv_8, 4, 8, 0
+    lea                  r6, [vp9_spel_filter_sharp-8]
+    mov                 r7d, 256*3
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+cglobal vp9_%1_8tap_regular_64hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my
+    lea                  r6, [vp9_spel_filter_regular-8]
+    mov                 r7d, 256*3
+    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+
+    SPEL_HV_INIT         %1, 16
+    vpbroadcastw        m11, [myq+0]
+    mova                 m6, [spel_h_perm16]
+    vpbroadcastw        m12, [myq+2]
+    vpbroadcastd         m8, [pb_4]
+    vpbroadcastw        m13, [myq+4]
+    vpbroadcastd        m15, [pw_256]
+    vpbroadcastw        m14, [myq+6]
+    mova                m19, [spel_hv_perm16]
+    vpandd              m20, m19, [pw_m33] {1to16} ; even indices & ~32
+    paddb                m7, m6, m8
+    lea                 r6d, [hq+r7]
+    paddb                m8, m7
+%if WIN64
+    push                 r8
+%endif
+.hv_w16_loop0:
+    movu               ym16, [srcq+ssq*0]    ; 0
+    movu               ym17, [srcq+ssq*1]
+    lea                  r7, [srcq+r5]
+    vinserti32x8        m17, [srcq+ssq*2], 1 ; 1 2
+    movu               ym18, [r7+ssq*0]
+    mov                  r8, dstq
+    vinserti32x8        m18, [r7+ssq*1], 1   ; 3 4
+    movu                ym0, [r7+ssq*2]
+    add                  r7, r5
+    vinserti32x8         m0, [r7+ssq*0], 1   ; 5 6
+INIT_YMM avx512icl
+    PUT_8TAP_H           16, 1, 2, 3, 1
+INIT_ZMM avx512icl
+    PUT_8TAP_H           17, 1, 2, 3, 1
+    PUT_8TAP_H           18, 1, 2, 3, 1
+    PUT_8TAP_H            0, 1, 2, 3, 1
+    packuswb            m16, m17
+    packuswb            m17, m18
+    packuswb            m18, m0
+    vpermb               m1, m20, m16 ; 01 12
+    vpermb               m2, m19, m17 ; 23 34
+    vpermb               m3, m19, m18 ; 45 56
+.hv_w16_loop:
+    movu               ym18, [r7+ssq*1]
+    lea                  r7, [r7+ssq*2]
+    vinserti32x8        m18, [r7+ssq*0], 1
+    PUT_8TAP_H           18, 4, 16, 17, 1
+    pmaddubsw           m16, m1, m11 ; a0 b0
+    mova                 m1, m2
+    pmaddubsw           m17, m2, m12 ; a1 b1
+    mova                 m2, m3
+    pmaddubsw            m3, m13     ; a2 b2
+    packuswb             m4, m0, m18
+    paddw               m16, m3
+    vpermb               m3, m19, m4 ; 67 78
+    mova                 m0, m18
+    pmaddubsw            m4, m3, m14 ; a3 b3
+    paddw               m17, m4
+    paddsw              m16, m17
+    pmulhrsw            m16, m15
+    vextracti32x8      ym17, m16, 1
+    packuswb           ym16, ym17
+%ifidn %1, avg
+    mova               xm17, [r8+dsq*0]
+    vinserti128        ym17, [r8+dsq*1], 1
+    pavgb              ym16, ym17
+%endif
+    mova         [r8+dsq*0], xm16
+    vextracti128 [r8+dsq*1], ym16, 1
+    lea                  r8, [r8+dsq*2]
+    sub                  hd, 2
+    jg .hv_w16_loop
+    add                srcq, 16
+    add                dstq, 16
+    movzx                hd, r6b
+    sub                 r6d, 1<<8
+    jg .hv_w16_loop0
+    vzeroupper
+%if WIN64
+    pop                  r8
+%endif
+    RET
+%endmacro
+
+INIT_ZMM avx512icl
+MC_AVX512 put
+MC_AVX512 avg
+
+%endif
\ No newline at end of file
diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c
index 2fc45cff8e..2a3374541f 100644
--- a/tests/checkasm/vp9dsp.c
+++ b/tests/checkasm/vp9dsp.c
@@ -558,9 +558,9 @@ static void check_loopfilter(void)
 
 static void check_mc(void)
 {
-    LOCAL_ALIGNED_32(uint8_t, buf, [72 * 72 * 2]);
-    LOCAL_ALIGNED_32(uint8_t, dst0, [64 * 64 * 2]);
-    LOCAL_ALIGNED_32(uint8_t, dst1, [64 * 64 * 2]);
+    LOCAL_ALIGNED_64(uint8_t, buf, [72 * 72 * 2]);
+    LOCAL_ALIGNED_64(uint8_t, dst0, [64 * 64 * 2]);
+    LOCAL_ALIGNED_64(uint8_t, dst1, [64 * 64 * 2]);
     VP9DSPContext dsp;
     int op, hsize, bit_depth, filter, dx, dy;
     declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride,
-- 
2.49.1
