* [FFmpeg-devel] [PATCH] vp9: Add AVX-512ICL asm for 8bpc subpel mc (PR #20358)
@ 2025-08-28 1:18 gramner via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: gramner via ffmpeg-devel @ 2025-08-28 1:18 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gramner
PR #20358 opened by gramner
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20358
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20358.patch
Checkasm numbers on Zen 5 (Strix Halo):
```
vp9_put_8tap_smooth_4h_8bpp_ssse3: 18.0
vp9_put_8tap_smooth_4h_8bpp_avx512icl: 7.9
vp9_put_8tap_smooth_4v_8bpp_ssse3: 18.0
vp9_put_8tap_smooth_4v_8bpp_avx512icl: 10.4
vp9_put_8tap_smooth_4hv_8bpp_ssse3: 67.8
vp9_put_8tap_smooth_4hv_8bpp_avx512icl: 14.7
vp9_put_8tap_smooth_8h_8bpp_ssse3: 28.0
vp9_put_8tap_smooth_8h_8bpp_avx512icl: 11.1
vp9_put_8tap_smooth_8v_8bpp_ssse3: 28.9
vp9_put_8tap_smooth_8v_8bpp_avx512icl: 15.1
vp9_put_8tap_smooth_8hv_8bpp_ssse3: 79.2
vp9_put_8tap_smooth_8hv_8bpp_avx512icl: 25.8
vp9_put_8tap_smooth_16h_8bpp_ssse3: 78.7
vp9_put_8tap_smooth_16h_8bpp_avx512icl: 27.3
vp9_put_8tap_smooth_16v_8bpp_ssse3: 79.1
vp9_put_8tap_smooth_16v_8bpp_avx512icl: 34.0
vp9_put_8tap_smooth_16hv_8bpp_ssse3: 199.8
vp9_put_8tap_smooth_16hv_8bpp_avx512icl: 70.2
vp9_put_8tap_smooth_32h_8bpp_avx2: 151.5
vp9_put_8tap_smooth_32h_8bpp_avx512icl: 81.6
vp9_put_8tap_smooth_32v_8bpp_avx2: 148.0
vp9_put_8tap_smooth_32v_8bpp_avx512icl: 101.0
vp9_put_8tap_smooth_32hv_8bpp_avx2: 337.7
vp9_put_8tap_smooth_32hv_8bpp_avx512icl: 247.1
vp9_put_8tap_smooth_64h_8bpp_avx2: 600.1
vp9_put_8tap_smooth_64h_8bpp_avx512icl: 259.9
vp9_put_8tap_smooth_64v_8bpp_avx2: 590.6
vp9_put_8tap_smooth_64v_8bpp_avx512icl: 252.4
vp9_put_8tap_smooth_64hv_8bpp_avx2: 1343.4
vp9_put_8tap_smooth_64hv_8bpp_avx512icl: 938.1
```
From 915710f2e618f0fcc7a9daacfbd2fcdb3797cf2f Mon Sep 17 00:00:00 2001
From: Henrik Gramner <gramner@twoorioles.com>
Date: Wed, 27 Aug 2025 22:51:08 +0200
Subject: [PATCH] vp9: Add AVX-512ICL asm for 8bpc subpel mc
---
libavcodec/x86/vp9dsp_init.c | 12 +
libavcodec/x86/vp9dsp_init.h | 31 ++
libavcodec/x86/vp9mc.asm | 866 +++++++++++++++++++++++++++++++++++
tests/checkasm/vp9dsp.c | 6 +-
4 files changed, 912 insertions(+), 3 deletions(-)
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 4373fa3f04..15862f43bf 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -50,6 +50,13 @@ decl_mc_funcs(8, ssse3, int8_t, 32, 8);
#if ARCH_X86_64
decl_mc_funcs(16, ssse3, int8_t, 32, 8);
decl_mc_funcs(32, avx2, int8_t, 32, 8);
+#if HAVE_AVX512ICL_EXTERNAL
+decl_subpel_asm( 4, 8, avx512icl);
+decl_subpel_asm( 8, 8, avx512icl);
+decl_subpel_asm(16, 8, avx512icl);
+decl_subpel_asm(32, 8, avx512icl);
+decl_subpel_asm(64, 8, avx512icl);
+#endif
#endif
mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8)
@@ -418,6 +425,11 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx512icl;
+ init_subpel_asm(4, 4, 8, avx512icl);
+ init_subpel_asm(3, 8, 8, avx512icl);
+ init_subpel_asm(2, 16, 8, avx512icl);
+ init_subpel_asm(1, 32, 8, avx512icl);
+ init_subpel_asm(0, 64, 8, avx512icl);
}
#endif
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index fc1e0557fa..5690d16970 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -172,6 +172,37 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
init_subpel3_8to64(idx, type, bpp, opt); \
init_subpel2(4, idx, 4, type, bpp, opt)
+#define decl_subpel_asm_fn(type) \
+ void ff_vp9_put_8tap_##type(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my); \
+ void ff_vp9_avg_8tap_##type(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define decl_subpel_asm_dir(type) \
+ decl_subpel_asm_fn(regular_##type); \
+ decl_subpel_asm_fn(smooth_##type); \
+ decl_subpel_asm_fn(sharp_##type)
+
+#define decl_subpel_asm(sz, bpp, opt) \
+ decl_subpel_asm_dir(sz##h_##bpp##_##opt); \
+ decl_subpel_asm_dir(sz##v_##bpp##_##opt); \
+ decl_subpel_asm_dir(sz##hv_##bpp##_##opt)
+
+#define init_subpel_asm_dir(idx1, idx2, idx3, type) \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][0][idx2][idx3] = ff_vp9_put_8tap_regular_##type; \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][0][idx2][idx3] = ff_vp9_put_8tap_sharp_##type; \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][0][idx2][idx3] = ff_vp9_put_8tap_smooth_##type; \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][1][idx2][idx3] = ff_vp9_avg_8tap_regular_##type; \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][1][idx2][idx3] = ff_vp9_avg_8tap_sharp_##type; \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][idx2][idx3] = ff_vp9_avg_8tap_smooth_##type
+
+#define init_subpel_asm(idx, sz, bpp, opt) \
+ init_subpel_asm_dir(idx, 1, 0, sz##h_##bpp##_##opt); \
+ init_subpel_asm_dir(idx, 0, 1, sz##v_##bpp##_##opt); \
+ init_subpel_asm_dir(idx, 1, 1, sz##hv_##bpp##_##opt)
+
#define init_ipred_func(type, enum, sz, bpp, opt) \
dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index efc4cfbef1..286f23c999 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -2,6 +2,7 @@
;* VP9 motion compensation SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;* Copyright (c) 2025 Two Orioles, LLC
;*
;* This file is part of FFmpeg.
;*
@@ -113,6 +114,89 @@ FILTER sse2
; int16_t ff_filters_16bpp[3][15][4][16]
FILTER 16bpp
+%if HAVE_AVX512ICL_EXTERNAL && ARCH_X86_64
+ALIGN 64
+spel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+ db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
+ db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+ db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
+ db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
+ db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+ db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
+ db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+spel_hv_perm4: db 16, 32, 48, 8, 18, 34, 50, 10, 20, 36, 52, 12, 22, 38, 54, 14
+ db 32, 48, 8, 24, 34, 50, 10, 26, 36, 52, 12, 28, 38, 54, 14, 30
+ db 48, 8, 24, 40, 50, 10, 26, 42, 52, 12, 28, 44, 54, 14, 30, 46
+ db 8, 24, 40, 56, 10, 26, 42, 58, 12, 28, 44, 60, 14, 30, 46, 62
+spel_hv_perm8: db 16, 32, 48, 8, 17, 33, 49, 9, 18, 34, 50, 10, 19, 35, 51, 11
+ db 32, 48, 8, 24, 33, 49, 9, 25, 34, 50, 10, 26, 35, 51, 11, 27
+ db 48, 8, 24, 40, 49, 9, 25, 41, 50, 10, 26, 42, 51, 11, 27, 43
+ db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
+spel_hv_perm16: db 32, 8, 33, 9, 34, 10, 35, 11, 36, 12, 37, 13, 38, 14, 39, 15
+ db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+ db 48, 24, 49, 25, 50, 26, 51, 27, 52, 28, 53, 29, 54, 30, 55, 31
+ db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+spel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+%define spel_h_shufA (spel_h_perm16+ 0)
+%define spel_h_shufC (spel_h_perm16+16)
+
+vp9_spel_filter_regular: db 0, 1, -5, 126, 8, -3, 1, 0
+ db -1, 3, -10, 122, 18, -6, 2, 0
+ db -1, 4, -13, 118, 27, -9, 3, -1
+ db -1, 4, -16, 112, 37, -11, 4, -1
+ db -1, 5, -18, 105, 48, -14, 4, -1
+ db -1, 5, -19, 97, 58, -16, 5, -1
+ db -1, 6, -19, 88, 68, -18, 5, -1
+ db -1, 6, -19, 78, 78, -19, 6, -1
+ db -1, 5, -18, 68, 88, -19, 6, -1
+ db -1, 5, -16, 58, 97, -19, 5, -1
+ db -1, 4, -14, 48, 105, -18, 5, -1
+ db -1, 4, -11, 37, 112, -16, 4, -1
+ db -1, 3, -9, 27, 118, -13, 4, -1
+ db 0, 2, -6, 18, 122, -10, 3, -1
+ db 0, 1, -3, 8, 126, -5, 1, 0
+vp9_spel_filter_sharp: db -1, 3, -7, 127, 8, -3, 1, 0
+ db -2, 5, -13, 125, 17, -6, 3, -1
+ db -3, 7, -17, 121, 27, -10, 5, -2
+ db -4, 9, -20, 115, 37, -13, 6, -2
+ db -4, 10, -23, 108, 48, -16, 8, -3
+ db -4, 10, -24, 100, 59, -19, 9, -3
+ db -4, 11, -24, 90, 70, -21, 10, -4
+ db -4, 11, -23, 80, 80, -23, 11, -4
+ db -4, 10, -21, 70, 90, -24, 11, -4
+ db -3, 9, -19, 59, 100, -24, 10, -4
+ db -3, 8, -16, 48, 108, -23, 10, -4
+ db -2, 6, -13, 37, 115, -20, 9, -4
+ db -2, 5, -10, 27, 121, -17, 7, -3
+ db -1, 3, -6, 17, 125, -13, 5, -2
+ db 0, 1, -3, 8, 127, -7, 3, -1
+vp9_spel_filter_smooth: db -3, -1, 32, 64, 38, 1, -3, 0
+ db -2, -2, 29, 63, 41, 2, -3, 0
+ db -2, -2, 26, 63, 43, 4, -4, 0
+ db -2, -3, 24, 62, 46, 5, -4, 0
+ db -2, -3, 21, 60, 49, 7, -4, 0
+ db -1, -4, 18, 59, 51, 9, -4, 0
+ db -1, -4, 16, 57, 53, 12, -4, -1
+ db -1, -4, 14, 55, 55, 14, -4, -1
+ db -1, -4, 12, 53, 57, 16, -4, -1
+ db 0, -4, 9, 51, 59, 18, -4, -1
+ db 0, -4, 7, 49, 60, 21, -3, -2
+ db 0, -4, 5, 46, 62, 24, -3, -2
+ db 0, -4, 4, 43, 63, 26, -2, -2
+ db 0, -3, 2, 41, 63, 29, -2, -2
+ db 0, -3, 1, 38, 64, 32, -1, -3
+
+pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
+pd_64: dd 64
+pw_m33: times 2 dw -33
+pb_4: times 4 db 4
+%endif
+
SECTION .text
%macro filter_sse2_h_fn 1
@@ -678,3 +762,785 @@ fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16
%undef d16
%undef s32
%undef d32
+
+%if HAVE_AVX512ICL_EXTERNAL && ARCH_X86_64
+%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
+%if %5
+ vpermb m%2, m6, m%1
+ vpermb m%3, m7, m%1
+ vpermb m%4, m8, m%1
+%else
+%if %2 < %4 ; reuse a previous value if possible
+ pshufb m%2, m%1, m6
+%endif
+ pshufb m%3, m%1, m7
+ pshufb m%4, m%1, m8
+%endif
+ mova m%1, m5
+ vpdpbusd m%1, m%2, m9
+ mova m%2, m5
+ vpdpbusd m%2, m%3, m9
+ vpdpbusd m%1, m%3, m10
+ vpdpbusd m%2, m%4, m10
+ packusdw m%1, m%2
+ psrlw m%1, 7
+%endmacro
+
+%macro SPEL_H_INIT 2 ; put/avg, w
+cglobal vp9_%1_8tap_smooth_%2h_8, 4, 7, 0
+ lea r6, [vp9_spel_filter_smooth-8]
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2h_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_sharp_%2h_8, 4, 7, 0
+ lea r6, [vp9_spel_filter_sharp-8]
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2h_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_regular_%2h_8, 4, 7, 0, dst, ds, src, ss, h, mx
+ lea r6, [vp9_spel_filter_regular-8]
+.main:
+ mov mxd, mxm
+ movifnidn hd, hm
+ sub srcq, 3
+ vpbroadcastd m5, [pd_64]
+ vpbroadcastd m9, [r6+mxq*8+0]
+ vpbroadcastd m10, [r6+mxq*8+4]
+%endmacro
+
+%macro SPEL_V_INIT 2 ; put/avg, w
+cglobal vp9_%1_8tap_smooth_%2v_8, 4, 7, 0
+ lea r5, [vp9_spel_filter_smooth-8]
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2v_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_sharp_%2v_8, 4, 7, 0
+ lea r5, [vp9_spel_filter_sharp-8]
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2v_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_regular_%2v_8, 4, 7, 0, dst, ds, src, ss, h, mx, my
+ lea r5, [vp9_spel_filter_regular-8]
+.main:
+ mov myd, mym
+ movifnidn hd, hm
+ lea myq, [r5+myq*8]
+ vpbroadcastd m7, [pw_256]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ lea r5, [ssq*3]
+ vpbroadcastw m10, [myq+4]
+ sub srcq, r5
+ vpbroadcastw m11, [myq+6]
+%endmacro
+
+%macro SPEL_HV_INIT 2 ; put/avg, w
+cglobal vp9_%1_8tap_smooth_%2hv_8, 4, 8, 0
+ lea r6, [vp9_spel_filter_smooth-8]
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2hv_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_sharp_%2hv_8, 4, 8, 0
+ lea r6, [vp9_spel_filter_sharp-8]
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2hv_8 %+ SUFFIX).main
+cglobal vp9_%1_8tap_regular_%2hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my
+ lea r6, [vp9_spel_filter_regular-8]
+.main:
+%if %2 == 16
+ xor r7d, r7d
+.main2:
+%endif
+ mov mxd, mxm
+ movifnidn hd, hm
+ sub srcq, 3
+ vpbroadcastd m9, [r6+mxq*8+0]
+ vpbroadcastd m10, [r6+mxq*8+4]
+ mov mxd, mym
+ vpbroadcastd m5, [pd_64]
+ lea myq, [r6+mxq*8]
+ lea r5, [ssq*3]
+ sub srcq, r5
+%endmacro
+
+%macro MC_AVX512 1 ; put/avg
+ SPEL_H_INIT %1, 4
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea r5, [ssq*3]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ lea r6, [dsq*3]
+ vbroadcasti32x4 m8, [spel_h_shufC]
+.h_w4_loop:
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ vinserti32x4 m0, [srcq+ssq*2], 2
+ vinserti32x4 m0, [srcq+r5 ], 3
+ lea srcq, [srcq+ssq*4]
+ pshufb m1, m0, m6
+ pshufb m0, m7
+ mova m2, m5
+ vpdpbusd m2, m1, m9
+ vpdpbusd m2, m0, m10
+ vpmovsdw ym0, m2
+ psraw ym0, 7
+ packuswb ym0, ym0
+ vextracti32x4 xm1, ym0, 1
+%ifidn %1, avg
+ movd xmm2, [dstq+dsq*0]
+ pinsrd xmm2, [dstq+dsq*1], 1
+ movd xmm3, [dstq+dsq*2]
+ pinsrd xmm3, [dstq+r6 ], 1
+ pavgb xm0, xmm2
+ pavgb xm1, xmm3
+%endif
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ movd [dstq+dsq*2], xm1
+ pextrd [dstq+r6 ], xm1, 1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+
+ SPEL_H_INIT %1, 8
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea r5, [ssq*3]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ lea r6, [dsq*3]
+ vbroadcasti32x4 m8, [spel_h_shufC]
+.h_w8_loop:
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ vinserti32x4 m0, [srcq+ssq*2], 2
+ vinserti32x4 m0, [srcq+r5 ], 3
+ lea srcq, [srcq+ssq*4]
+ PUT_8TAP_H 0, 1, 2, 3
+ vpmovuswb ym0, m0
+ vextracti32x4 xm1, ym0, 1
+%ifidn %1, avg
+ movq xmm2, [dstq+dsq*0]
+ movhps xmm2, [dstq+dsq*1]
+ movq xmm3, [dstq+dsq*2]
+ movhps xmm3, [dstq+r6 ]
+ pavgb xm0, xmm2
+ pavgb xm1, xmm3
+%endif
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ movq [dstq+dsq*2], xm1
+ movhps [dstq+r6 ], xm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+
+ SPEL_H_INIT %1, 16
+ mova m6, [spel_h_perm16]
+ vpbroadcastd m8, [pb_4]
+ paddb m7, m8, m6
+ paddb m8, m7
+.h_w16_loop:
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3, 1
+ vpmovuswb ym0, m0
+%ifidn %1, avg
+ movu xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ pavgb ym0, ym1
+%endif
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+
+ SPEL_H_INIT %1, 32
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ vbroadcasti32x4 m8, [spel_h_shufC]
+.h_w32_loop:
+ movu ym0, [srcq+ssq*0+8*0]
+ vinserti32x8 m0, [srcq+ssq*1+8*0], 1
+ movu ym1, [srcq+ssq*0+8*1]
+ vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+%ifidn %1, avg
+ movu ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ pavgb m0, m1
+%endif
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+
+ SPEL_H_INIT %1, 64
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ vbroadcasti32x4 m8, [spel_h_shufC]
+.h_w64_loop:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+%ifidn %1, avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w64_loop
+ RET
+
+ SPEL_V_INIT %1, 4
+ movd xmm2, [srcq+ssq*0]
+ pinsrd xmm2, [srcq+ssq*1], 1
+ pinsrd xmm2, [srcq+ssq*2], 2
+ add srcq, r5
+ pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, r5
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm3, xmm1 ; 45 56
+ punpcklbw xmm1, xmm2, xmm4 ; 01 12
+ punpckhbw xmm2, xmm4 ; 23 34
+%if WIN64
+ movaps [rsp+8], xmm6
+%endif
+.v_w4_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xmm5, xmm1, xm8 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm6, xmm2, xm9 ; a1 b1
+ mova xmm2, xmm3
+ pmaddubsw xmm3, xm10 ; a2 b2
+ paddw xmm5, xmm3
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm0, 0x02 ; 7 8
+ punpcklbw xmm3, xmm4 ; 67 78
+ pmaddubsw xmm4, xmm3, xm11 ; a3 b3
+ paddw xmm6, xmm4
+ paddsw xmm5, xmm6
+ pmulhrsw xmm5, xm7
+ packuswb xmm5, xmm5
+%ifidn %1, avg
+ movd xmm4, [dstq+dsq*0]
+ pinsrd xmm4, [dstq+dsq*1], 1
+ pavgb xmm5, xmm4
+%endif
+ movd [dstq+dsq*0], xmm5
+ pextrd [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+%if WIN64
+ movaps xmm6, [rsp+8]
+%endif
+ RET
+
+ SPEL_V_INIT %1, 8
+ movq xmm1, [srcq+ssq*0]
+ vpbroadcastq ymm0, [srcq+ssq*1]
+ vpbroadcastq ymm2, [srcq+ssq*2]
+ add srcq, r5
+ vpbroadcastq ymm5, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm4, [srcq+ssq*2]
+ add srcq, r5
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklbw ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm5, 0x30
+ vpblendd ymm5, ymm3, 0x30
+ punpcklbw ymm2, ymm5 ; 23 34
+ vpblendd ymm3, ymm4, 0x30
+ vpblendd ymm4, ymm0, 0x30
+ punpcklbw ymm3, ymm4 ; 45 56
+%if WIN64
+ movaps [rsp+8], xmm6
+%endif
+.v_w8_loop:
+ vpbroadcastq ymm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw ymm5, ymm1, ym8 ; a0 b0
+ mova ymm1, ymm2
+ pmaddubsw ymm6, ymm2, ym9 ; a1 b1
+ mova ymm2, ymm3
+ pmaddubsw ymm3, ym10 ; a2 b2
+ paddw ymm5, ymm3
+ vpblendd ymm3, ymm0, ymm4, 0x30
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm4, ymm4, ymm0, 0x30
+ punpcklbw ymm3, ymm4 ; 67 78
+ pmaddubsw ymm4, ymm3, ym11 ; a3 b3
+ paddw ymm6, ymm4
+ paddsw ymm5, ymm6
+ pmulhrsw ymm5, ym7
+ vextracti128 xmm4, ymm5, 1
+ packuswb xmm5, xmm4
+%ifidn %1, avg
+ movq xmm4, [dstq+dsq*0]
+ movhps xmm4, [dstq+dsq*1]
+ pavgb xmm5, xmm4
+%endif
+ movq [dstq+dsq*0], xmm5
+ movhps [dstq+dsq*1], xmm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+%if WIN64
+ movaps xmm6, [rsp+8]
+%endif
+ vzeroupper
+ RET
+
+ SPEL_V_INIT %1, 16
+ mova m12, [spel_v_perm16]
+ vbroadcasti32x4 m1, [srcq+ssq*0]
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ mov r6d, 0x0f
+ vbroadcasti32x4 m2, [srcq+ssq*2]
+ add srcq, r5
+ vbroadcasti32x4 ym5, [srcq+ssq*0]
+ kmovb k1, r6d
+ vbroadcasti32x4 m3, [srcq+ssq*1]
+ vbroadcasti32x4 ym6, [srcq+ssq*2]
+ add srcq, r5
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m1{k1}, m4, m2, 0xcc
+ vshufpd m2{k1}, m5, m3, 0xcc
+ vshufpd m3{k1}, m6, m0, 0xcc
+ vpermb m1, m12, m1 ; 01 12
+ vpermb m2, m12, m2 ; 23 34
+ vpermb m3, m12, m3 ; 45 56
+.v_w16_loop:
+ pmaddubsw m4, m1, m8 ; a0 b0
+ mova m1, m2
+ pmaddubsw m5, m2, m9 ; a1 b1
+ mova m2, m3
+ pmaddubsw m6, m3, m10 ; a2 b2
+ mova m3, m0
+ paddw m4, m6
+ vbroadcasti32x4 ym6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m3{k1}, m6, m0, 0xcc
+ vpermb m3, m12, m3 ; 67 78
+ pmaddubsw m6, m3, m11 ; a3 b3
+ paddw m5, m6
+ paddsw m4, m5
+ pmulhrsw m4, m7
+ vextracti32x8 ym5, m4, 1
+ packuswb ym4, ym5
+%ifidn %1, avg
+ mova xm5, [dstq+dsq*0]
+ vinserti32x4 ym5, [dstq+dsq*1], 1
+ pavgb ym4, ym5
+%endif
+ mova [dstq+dsq*0], xm4
+ vextracti32x4 [dstq+dsq*1], ym4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+
+ SPEL_V_INIT %1, 32
+ mova m12, [spel_v_perm32]
+ pmovzxbq m14, [pb_02461357]
+ vpshrdw m13, m12, m12, 8
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ vpermb m1, m12, m0 ; 01
+ vinserti32x8 m0, [srcq+ssq*2], 0
+ add srcq, r5
+ vpermb m2, m13, m0 ; 12
+ vinserti32x8 m0, [srcq+ssq*0], 1
+ vpermb m3, m12, m0 ; 23
+ vinserti32x8 m0, [srcq+ssq*1], 0
+ vpermb m4, m13, m0 ; 34
+ vinserti32x8 m0, [srcq+ssq*2], 1
+ add srcq, r5
+ vpermb m5, m12, m0 ; 45
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ vpermb m6, m13, m0 ; 56
+.v_w32_loop:
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m15, m1, m8
+ mova m1, m3
+ pmaddubsw m16, m2, m8
+ mova m2, m4
+ pmaddubsw m17, m3, m9
+ mova m3, m5
+ pmaddubsw m18, m4, m9
+ mova m4, m6
+ pmaddubsw m19, m5, m10
+ vpermb m5, m12, m0 ; 67
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ pmaddubsw m20, m6, m10
+ vpermb m6, m13, m0 ; 78
+ paddw m15, m19
+ pmaddubsw m19, m5, m11
+ paddw m16, m20
+ pmaddubsw m20, m6, m11
+ paddw m17, m19
+ paddw m18, m20
+ paddsw m15, m17
+ paddsw m16, m18
+ pmulhrsw m15, m7
+ pmulhrsw m16, m7
+ packuswb m15, m16
+ vpermq m15, m14, m15
+%ifidn %1, avg
+ mova ym16, [dstq+dsq*0]
+ vinserti32x8 m16, [dstq+dsq*1], 1
+ pavgb m15, m16
+%endif
+ mova [dstq+dsq*0], ym15
+ vextracti32x8 [dstq+dsq*1], m15, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+
+ SPEL_V_INIT %1, 64
+ movu m2, [srcq+ssq*0]
+ movu m4, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, r5
+ movu m13, [srcq+ssq*0]
+ movu m15, [srcq+ssq*1]
+ movu m17, [srcq+ssq*2]
+ add srcq, r5
+ movu m0, [srcq+ssq*0]
+ punpcklbw m1, m2, m4 ; 01l
+ punpckhbw m2, m4 ; 01h
+ punpcklbw m3, m4, m6 ; 12l
+ punpckhbw m4, m6 ; 12h
+ punpcklbw m5, m6, m13 ; 23l
+ punpckhbw m6, m13 ; 23h
+ punpcklbw m12, m13, m15 ; 34l
+ punpckhbw m13, m15 ; 34h
+ punpcklbw m14, m15, m17 ; 45l
+ punpckhbw m15, m17 ; 45h
+ punpcklbw m16, m17, m0 ; 56l
+ punpckhbw m17, m0 ; 56h
+%if WIN64
+ movaps [rsp+8], xmm6
+%endif
+.v_w64_loop:
+ movu m22, [srcq+ssq*1]
+ pmaddubsw m1, m8 ; a0l
+ pmaddubsw m18, m14, m10 ; a2l
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m2, m8 ; a0h
+ pmaddubsw m19, m15, m10 ; a2h
+ paddw m18, m1
+ mova m1, m5
+ paddw m19, m2
+ mova m2, m6
+ pmaddubsw m20, m5, m9 ; a1l
+ mova m5, m14
+ pmaddubsw m21, m6, m9 ; a1h
+ mova m6, m15
+ punpcklbw m14, m0, m22 ; 67l
+ punpckhbw m15, m0, m22 ; 67h
+ pmaddubsw m0, m14, m11 ; a3l
+ paddw m20, m0
+ pmaddubsw m0, m15, m11 ; a3h
+ paddw m21, m0
+ movu m0, [srcq+ssq*0]
+ paddsw m18, m20
+ paddsw m19, m21
+ pmaddubsw m3, m8 ; b0l
+ pmaddubsw m20, m16, m10 ; b2l
+ pmaddubsw m4, m8 ; b0h
+ pmaddubsw m21, m17, m10 ; b2h
+ pmulhrsw m18, m7
+ pmulhrsw m19, m7
+ paddw m20, m3
+ mova m3, m12
+ paddw m21, m4
+ mova m4, m13
+ packuswb m18, m19
+%ifidn %1, avg
+ pavgb m18, [dstq+dsq*0]
+%endif
+ mova [dstq+dsq*0], m18
+ pmaddubsw m18, m12, m9 ; b1l
+ mova m12, m16
+ punpcklbw m16, m22, m0 ; 78l
+ pmaddubsw m19, m13, m9 ; b1h
+ mova m13, m17
+ punpckhbw m17, m22, m0 ; 78h
+ pmaddubsw m22, m16, m11 ; b3l
+ paddw m18, m22
+ pmaddubsw m22, m17, m11 ; b3h
+ paddw m19, m22
+ paddsw m18, m20
+ paddsw m19, m21
+ pmulhrsw m18, m7
+ pmulhrsw m19, m7
+ packuswb m18, m19
+%ifidn %1, avg
+ pavgb m18, [dstq+dsq*1]
+%endif
+ mova [dstq+dsq*1], m18
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w64_loop
+%if WIN64
+ movaps xmm6, [rsp+8]
+%endif
+ vzeroupper
+ RET
+
+ SPEL_HV_INIT %1, 4
+ vbroadcasti32x4 ym2, [srcq+ssq*0]
+ vinserti32x4 m2, [srcq+ssq*1], 2
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 0 1 2
+ add srcq, r5
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ vinserti32x4 m0, [srcq+ssq*2], 2
+ add srcq, r5
+ vpbroadcastd m11, [myq+0]
+ vinserti32x4 m0, [srcq+ssq*0], 3 ; 3 4 5 6
+ vpbroadcastd m12, [myq+4]
+ lea r6, [dsq*3]
+ mova m8, [spel_hv_perm4]
+ pshufb m4, m2, m6
+ mova m1, m5
+ vpdpbusd m1, m4, m9
+ pshufb m4, m0, m6
+ mova m3, m5
+ vpdpbusd m3, m4, m9
+ pshufb m2, m7
+ pshufb m0, m7
+ vpdpbusd m1, m2, m10
+ vpdpbusd m3, m0, m10
+ psrad m1, 7
+ psrad m0, m3, 7
+ packuswb m1, m0 ; _3 04 15 26
+ vpermb m1, m8, m1 ; 0123 1234 2345 3456
+.hv_w4_loop:
+ movu xm4, [srcq+ssq*1]
+ vinserti32x4 ym4, [srcq+ssq*2], 1
+ vinserti32x4 m4, [srcq+r5 ], 2
+ lea srcq, [srcq+ssq*4]
+ vinserti32x4 m4, [srcq+ssq*0], 3 ; 7 8 9 a
+ mova m3, m5
+ pshufb m2, m4, m6
+ vpdpbusd m3, m2, m9
+ mova m2, m5
+ vpdpbusd m2, m1, m11
+ pshufb m4, m7
+ vpdpbusd m3, m4, m10
+ psrad m3, 7
+ packuswb m1, m0, m3 ; 37 48 59 6a
+ mova m0, m3
+ vpermb m1, m8, m1 ; 4567 5678 6789 789a
+ vpdpbusd m2, m1, m12
+ psrad m2, 7
+ vpmovdw ym2, m2
+ packuswb ym2, ym2
+ vextracti32x4 xm3, ym2, 1
+%ifidn %1, avg
+ movd xmm4, [dstq+dsq*0]
+ pinsrd xmm4, [dstq+dsq*1], 1
+ pavgb xm2, xmm4
+ movd xmm4, [dstq+dsq*2]
+ pinsrd xmm4, [dstq+r6 ], 1
+ pavgb xm3, xmm4
+%endif
+ movd [dstq+dsq*0], xm2
+ pextrd [dstq+dsq*1], xm2, 1
+ movd [dstq+dsq*2], xm3
+ pextrd [dstq+r6 ], xm3, 1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+
+ SPEL_HV_INIT %1, 8
+ vbroadcasti32x4 ym2, [srcq+ssq*0]
+ vinserti32x4 m2, [srcq+ssq*1], 2
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 0 1 2
+ add srcq, r5
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ vinserti32x4 m0, [srcq+ssq*2], 2
+ add srcq, r5
+ vpbroadcastd m11, [myq+0]
+ vinserti32x4 m0, [srcq+ssq*0], 3 ; 3 4 5 6
+ vpbroadcastd m12, [myq+4]
+ lea r6, [dsq*3]
+ vbroadcasti32x4 m8, [spel_h_shufC]
+ mova m13, [spel_hv_perm8]
+ vpaddd m14, m13, [pb_4] {1to16}
+ PUT_8TAP_H 2, 1, 3, 4
+ PUT_8TAP_H 0, 1, 3, 4
+ packuswb m2, m0 ; _3 04 15 26
+ vpermb m1, m13, m2 ; 0123 1234 2345 3456 (abcd)
+ vpermb m2, m14, m2 ; 0123 1234 2345 3456 (efgh)
+.hv_w8_loop:
+ movu xm18, [srcq+ssq*1]
+ vinserti128 ym18, [srcq+ssq*2], 1
+ vinserti32x4 m18, [srcq+r5 ], 2
+ lea srcq, [srcq+ssq*4]
+ vinserti32x4 m18, [srcq+ssq*0], 3 ; 7 8 9 a
+ PUT_8TAP_H 18, 4, 16, 17
+ mova m16, m5
+ vpdpbusd m16, m1, m11
+ mova m17, m5
+ vpdpbusd m17, m2, m11
+ packuswb m2, m0, m18 ; 37 48 59 6a
+ mova m0, m18
+ vpermb m1, m13, m2 ; 4567 5678 6789 789a (abcd)
+ vpermb m2, m14, m2 ; 4567 5678 6789 789a (efgh)
+ vpdpbusd m16, m1, m12
+ vpdpbusd m17, m2, m12
+ packusdw m16, m17
+ psrlw m16, 7
+ vpmovuswb ym16, m16
+ vextracti128 xm17, ym16, 1
+%ifidn %1, avg
+ movq xm18, [dstq+dsq*0]
+ movhps xm18, [dstq+dsq*1]
+ pavgb xm16, xm18
+ movq xm18, [dstq+dsq*2]
+ movhps xm18, [dstq+r6 ]
+ pavgb xm17, xm18
+%endif
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ movq [dstq+dsq*2], xm17
+ movhps [dstq+r6 ], xm17
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .hv_w8_loop
+ vzeroupper
+ RET
+
+cglobal vp9_%1_8tap_smooth_32hv_8, 4, 8, 0
+ lea r6, [vp9_spel_filter_smooth-8]
+ mov r7d, 256*1
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+cglobal vp9_%1_8tap_sharp_32hv_8, 4, 8, 0
+ lea r6, [vp9_spel_filter_sharp-8]
+ mov r7d, 256*1
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+cglobal vp9_%1_8tap_regular_32hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my
+ lea r6, [vp9_spel_filter_regular-8]
+ mov r7d, 256*1
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+cglobal vp9_%1_8tap_smooth_64hv_8, 4, 8, 0
+ lea r6, [vp9_spel_filter_smooth-8]
+ mov r7d, 256*3
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+cglobal vp9_%1_8tap_sharp_64hv_8, 4, 8, 0
+ lea r6, [vp9_spel_filter_sharp-8]
+ mov r7d, 256*3
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+cglobal vp9_%1_8tap_regular_64hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my
+ lea r6, [vp9_spel_filter_regular-8]
+ mov r7d, 256*3
+ jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
+
+ SPEL_HV_INIT %1, 16
+ vpbroadcastw m11, [myq+0]
+ mova m6, [spel_h_perm16]
+ vpbroadcastw m12, [myq+2]
+ vpbroadcastd m8, [pb_4]
+ vpbroadcastw m13, [myq+4]
+ vpbroadcastd m15, [pw_256]
+ vpbroadcastw m14, [myq+6]
+ mova m19, [spel_hv_perm16]
+ vpandd m20, m19, [pw_m33] {1to16} ; even indices & ~32
+ paddb m7, m6, m8
+ lea r6d, [hq+r7]
+ paddb m8, m7
+%if WIN64
+ push r8
+%endif
+.hv_w16_loop0:
+ movu ym16, [srcq+ssq*0] ; 0
+ movu ym17, [srcq+ssq*1]
+ lea r7, [srcq+r5]
+ vinserti32x8 m17, [srcq+ssq*2], 1 ; 1 2
+ movu ym18, [r7+ssq*0]
+ mov r8, dstq
+ vinserti32x8 m18, [r7+ssq*1], 1 ; 3 4
+ movu ym0, [r7+ssq*2]
+ add r7, r5
+ vinserti32x8 m0, [r7+ssq*0], 1 ; 5 6
+INIT_YMM avx512icl
+ PUT_8TAP_H 16, 1, 2, 3, 1
+INIT_ZMM avx512icl
+ PUT_8TAP_H 17, 1, 2, 3, 1
+ PUT_8TAP_H 18, 1, 2, 3, 1
+ PUT_8TAP_H 0, 1, 2, 3, 1
+ packuswb m16, m17
+ packuswb m17, m18
+ packuswb m18, m0
+ vpermb m1, m20, m16 ; 01 12
+ vpermb m2, m19, m17 ; 23 34
+ vpermb m3, m19, m18 ; 45 56
+.hv_w16_loop:
+ movu ym18, [r7+ssq*1]
+ lea r7, [r7+ssq*2]
+ vinserti32x8 m18, [r7+ssq*0], 1
+ PUT_8TAP_H 18, 4, 16, 17, 1
+ pmaddubsw m16, m1, m11 ; a0 b0
+ mova m1, m2
+ pmaddubsw m17, m2, m12 ; a1 b1
+ mova m2, m3
+ pmaddubsw m3, m13 ; a2 b2
+ packuswb m4, m0, m18
+ paddw m16, m3
+ vpermb m3, m19, m4 ; 67 78
+ mova m0, m18
+ pmaddubsw m4, m3, m14 ; a3 b3
+ paddw m17, m4
+ paddsw m16, m17
+ pmulhrsw m16, m15
+ vextracti32x8 ym17, m16, 1
+ packuswb ym16, ym17
+%ifidn %1, avg
+ mova xm17, [r8+dsq*0]
+ vinserti128 ym17, [r8+dsq*1], 1
+ pavgb ym16, ym17
+%endif
+ mova [r8+dsq*0], xm16
+ vextracti128 [r8+dsq*1], ym16, 1
+ lea r8, [r8+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add srcq, 16
+ add dstq, 16
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ vzeroupper
+%if WIN64
+ pop r8
+%endif
+ RET
+%endmacro
+
+INIT_ZMM avx512icl
+MC_AVX512 put
+MC_AVX512 avg
+
+%endif
\ No newline at end of file
diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c
index 2fc45cff8e..2a3374541f 100644
--- a/tests/checkasm/vp9dsp.c
+++ b/tests/checkasm/vp9dsp.c
@@ -558,9 +558,9 @@ static void check_loopfilter(void)
static void check_mc(void)
{
- LOCAL_ALIGNED_32(uint8_t, buf, [72 * 72 * 2]);
- LOCAL_ALIGNED_32(uint8_t, dst0, [64 * 64 * 2]);
- LOCAL_ALIGNED_32(uint8_t, dst1, [64 * 64 * 2]);
+ LOCAL_ALIGNED_64(uint8_t, buf, [72 * 72 * 2]);
+ LOCAL_ALIGNED_64(uint8_t, dst0, [64 * 64 * 2]);
+ LOCAL_ALIGNED_64(uint8_t, dst1, [64 * 64 * 2]);
VP9DSPContext dsp;
int op, hsize, bit_depth, filter, dx, dy;
declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride,
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-08-28 1:18 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-08-28 1:18 [FFmpeg-devel] [PATCH] vp9: Add AVX-512ICL asm for 8bpc subpel mc (PR #20358) gramner via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git