From: gramner via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: gramner <code@ffmpeg.org> Subject: [FFmpeg-devel] [PATCH] vp9: Add AVX-512ICL asm for 8bpc subpel mc (PR #20358) Message-ID: <175634386154.25.12603703631889415666@463a07221176> (raw) PR #20358 opened by gramner URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20358 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20358.patch Checkasm numbers on Zen 5 (Strix Halo): ``` vp9_put_8tap_smooth_4h_8bpp_ssse3: 18.0 vp9_put_8tap_smooth_4h_8bpp_avx512icl: 7.9 vp9_put_8tap_smooth_4v_8bpp_ssse3: 18.0 vp9_put_8tap_smooth_4v_8bpp_avx512icl: 10.4 vp9_put_8tap_smooth_4hv_8bpp_ssse3: 67.8 vp9_put_8tap_smooth_4hv_8bpp_avx512icl: 14.7 vp9_put_8tap_smooth_8h_8bpp_ssse3: 28.0 vp9_put_8tap_smooth_8h_8bpp_avx512icl: 11.1 vp9_put_8tap_smooth_8v_8bpp_ssse3: 28.9 vp9_put_8tap_smooth_8v_8bpp_avx512icl: 15.1 vp9_put_8tap_smooth_8hv_8bpp_ssse3: 79.2 vp9_put_8tap_smooth_8hv_8bpp_avx512icl: 25.8 vp9_put_8tap_smooth_16h_8bpp_ssse3: 78.7 vp9_put_8tap_smooth_16h_8bpp_avx512icl: 27.3 vp9_put_8tap_smooth_16v_8bpp_ssse3: 79.1 vp9_put_8tap_smooth_16v_8bpp_avx512icl: 34.0 vp9_put_8tap_smooth_16hv_8bpp_ssse3: 199.8 vp9_put_8tap_smooth_16hv_8bpp_avx512icl: 70.2 vp9_put_8tap_smooth_32h_8bpp_avx2: 151.5 vp9_put_8tap_smooth_32h_8bpp_avx512icl: 81.6 vp9_put_8tap_smooth_32v_8bpp_avx2: 148.0 vp9_put_8tap_smooth_32v_8bpp_avx512icl: 101.0 vp9_put_8tap_smooth_32hv_8bpp_avx2: 337.7 vp9_put_8tap_smooth_32hv_8bpp_avx512icl: 247.1 vp9_put_8tap_smooth_64h_8bpp_avx2: 600.1 vp9_put_8tap_smooth_64h_8bpp_avx512icl: 259.9 vp9_put_8tap_smooth_64v_8bpp_avx2: 590.6 vp9_put_8tap_smooth_64v_8bpp_avx512icl: 252.4 vp9_put_8tap_smooth_64hv_8bpp_avx2: 1343.4 vp9_put_8tap_smooth_64hv_8bpp_avx512icl: 938.1 ``` From 915710f2e618f0fcc7a9daacfbd2fcdb3797cf2f Mon Sep 17 00:00:00 2001 From: Henrik Gramner <gramner@twoorioles.com> Date: Wed, 27 Aug 2025 22:51:08 +0200 Subject: [PATCH] vp9: Add AVX-512ICL asm for 8bpc subpel mc --- 
libavcodec/x86/vp9dsp_init.c | 12 + libavcodec/x86/vp9dsp_init.h | 31 ++ libavcodec/x86/vp9mc.asm | 866 +++++++++++++++++++++++++++++++++++ tests/checkasm/vp9dsp.c | 6 +- 4 files changed, 912 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index 4373fa3f04..15862f43bf 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -50,6 +50,13 @@ decl_mc_funcs(8, ssse3, int8_t, 32, 8); #if ARCH_X86_64 decl_mc_funcs(16, ssse3, int8_t, 32, 8); decl_mc_funcs(32, avx2, int8_t, 32, 8); +#if HAVE_AVX512ICL_EXTERNAL +decl_subpel_asm( 4, 8, avx512icl); +decl_subpel_asm( 8, 8, avx512icl); +decl_subpel_asm(16, 8, avx512icl); +decl_subpel_asm(32, 8, avx512icl); +decl_subpel_asm(64, 8, avx512icl); +#endif #endif mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8) @@ -418,6 +425,11 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) dsp->itxfm_add[TX_32X32][ADST_DCT] = dsp->itxfm_add[TX_32X32][DCT_ADST] = dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx512icl; + init_subpel_asm(4, 4, 8, avx512icl); + init_subpel_asm(3, 8, 8, avx512icl); + init_subpel_asm(2, 16, 8, avx512icl); + init_subpel_asm(1, 32, 8, avx512icl); + init_subpel_asm(0, 64, 8, avx512icl); } #endif diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h index fc1e0557fa..5690d16970 100644 --- a/libavcodec/x86/vp9dsp_init.h +++ b/libavcodec/x86/vp9dsp_init.h @@ -172,6 +172,37 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt) init_subpel3_8to64(idx, type, bpp, opt); \ init_subpel2(4, idx, 4, type, bpp, opt) +#define decl_subpel_asm_fn(type) \ + void ff_vp9_put_8tap_##type(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my); \ + void ff_vp9_avg_8tap_##type(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) + +#define decl_subpel_asm_dir(type) \ + 
decl_subpel_asm_fn(regular_##type); \ + decl_subpel_asm_fn(smooth_##type); \ + decl_subpel_asm_fn(sharp_##type) + +#define decl_subpel_asm(sz, bpp, opt) \ + decl_subpel_asm_dir(sz##h_##bpp##_##opt); \ + decl_subpel_asm_dir(sz##v_##bpp##_##opt); \ + decl_subpel_asm_dir(sz##hv_##bpp##_##opt) + +#define init_subpel_asm_dir(idx1, idx2, idx3, type) \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][0][idx2][idx3] = ff_vp9_put_8tap_regular_##type; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][0][idx2][idx3] = ff_vp9_put_8tap_sharp_##type; \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][0][idx2][idx3] = ff_vp9_put_8tap_smooth_##type; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][1][idx2][idx3] = ff_vp9_avg_8tap_regular_##type; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][1][idx2][idx3] = ff_vp9_avg_8tap_sharp_##type; \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][idx2][idx3] = ff_vp9_avg_8tap_smooth_##type + +#define init_subpel_asm(idx, sz, bpp, opt) \ + init_subpel_asm_dir(idx, 1, 0, sz##h_##bpp##_##opt); \ + init_subpel_asm_dir(idx, 0, 1, sz##v_##bpp##_##opt); \ + init_subpel_asm_dir(idx, 1, 1, sz##hv_##bpp##_##opt) + #define init_ipred_func(type, enum, sz, bpp, opt) \ dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \ cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt) diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm index efc4cfbef1..286f23c999 100644 --- a/libavcodec/x86/vp9mc.asm +++ b/libavcodec/x86/vp9mc.asm @@ -2,6 +2,7 @@ ;* VP9 motion compensation SIMD optimizations ;* ;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> +;* Copyright (c) 2025 Two Orioles, LLC ;* ;* This file is part of FFmpeg. 
;* @@ -113,6 +114,89 @@ FILTER sse2 ; int16_t ff_filters_16bpp[3][15][4][16] FILTER 16bpp +%if HAVE_AVX512ICL_EXTERNAL && ARCH_X86_64 +ALIGN 64 +spel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 + db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 +spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 + db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 + db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 + db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 +spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 + db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 + db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 + db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 +spel_hv_perm4: db 16, 32, 48, 8, 18, 34, 50, 10, 20, 36, 52, 12, 22, 38, 54, 14 + db 32, 48, 8, 24, 34, 50, 10, 26, 36, 52, 12, 28, 38, 54, 14, 30 + db 48, 8, 24, 40, 50, 10, 26, 42, 52, 12, 28, 44, 54, 14, 30, 46 + db 8, 24, 40, 56, 10, 26, 42, 58, 12, 28, 44, 60, 14, 30, 46, 62 +spel_hv_perm8: db 16, 32, 48, 8, 17, 33, 49, 9, 18, 34, 50, 10, 19, 35, 51, 11 + db 32, 48, 8, 24, 33, 49, 9, 25, 34, 50, 10, 26, 35, 51, 11, 27 + db 48, 8, 24, 40, 49, 9, 25, 41, 50, 10, 26, 42, 51, 11, 27, 43 + db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59 +spel_hv_perm16: db 32, 8, 33, 9, 34, 10, 35, 11, 36, 12, 37, 13, 38, 14, 39, 15 + db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 + db 48, 24, 49, 25, 50, 26, 51, 27, 52, 28, 53, 29, 54, 30, 55, 31 + db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 +spel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + +%define spel_h_shufA (spel_h_perm16+ 0) +%define spel_h_shufC (spel_h_perm16+16) + +vp9_spel_filter_regular: db 0, 1, -5, 126, 8, 
-3, 1, 0 + db -1, 3, -10, 122, 18, -6, 2, 0 + db -1, 4, -13, 118, 27, -9, 3, -1 + db -1, 4, -16, 112, 37, -11, 4, -1 + db -1, 5, -18, 105, 48, -14, 4, -1 + db -1, 5, -19, 97, 58, -16, 5, -1 + db -1, 6, -19, 88, 68, -18, 5, -1 + db -1, 6, -19, 78, 78, -19, 6, -1 + db -1, 5, -18, 68, 88, -19, 6, -1 + db -1, 5, -16, 58, 97, -19, 5, -1 + db -1, 4, -14, 48, 105, -18, 5, -1 + db -1, 4, -11, 37, 112, -16, 4, -1 + db -1, 3, -9, 27, 118, -13, 4, -1 + db 0, 2, -6, 18, 122, -10, 3, -1 + db 0, 1, -3, 8, 126, -5, 1, 0 +vp9_spel_filter_sharp: db -1, 3, -7, 127, 8, -3, 1, 0 + db -2, 5, -13, 125, 17, -6, 3, -1 + db -3, 7, -17, 121, 27, -10, 5, -2 + db -4, 9, -20, 115, 37, -13, 6, -2 + db -4, 10, -23, 108, 48, -16, 8, -3 + db -4, 10, -24, 100, 59, -19, 9, -3 + db -4, 11, -24, 90, 70, -21, 10, -4 + db -4, 11, -23, 80, 80, -23, 11, -4 + db -4, 10, -21, 70, 90, -24, 11, -4 + db -3, 9, -19, 59, 100, -24, 10, -4 + db -3, 8, -16, 48, 108, -23, 10, -4 + db -2, 6, -13, 37, 115, -20, 9, -4 + db -2, 5, -10, 27, 121, -17, 7, -3 + db -1, 3, -6, 17, 125, -13, 5, -2 + db 0, 1, -3, 8, 127, -7, 3, -1 +vp9_spel_filter_smooth: db -3, -1, 32, 64, 38, 1, -3, 0 + db -2, -2, 29, 63, 41, 2, -3, 0 + db -2, -2, 26, 63, 43, 4, -4, 0 + db -2, -3, 24, 62, 46, 5, -4, 0 + db -2, -3, 21, 60, 49, 7, -4, 0 + db -1, -4, 18, 59, 51, 9, -4, 0 + db -1, -4, 16, 57, 53, 12, -4, -1 + db -1, -4, 14, 55, 55, 14, -4, -1 + db -1, -4, 12, 53, 57, 16, -4, -1 + db 0, -4, 9, 51, 59, 18, -4, -1 + db 0, -4, 7, 49, 60, 21, -3, -2 + db 0, -4, 5, 46, 62, 24, -3, -2 + db 0, -4, 4, 43, 63, 26, -2, -2 + db 0, -3, 2, 41, 63, 29, -2, -2 + db 0, -3, 1, 38, 64, 32, -1, -3 + +pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 +pd_64: dd 64 +pw_m33: times 2 dw -33 +pb_4: times 4 db 4 +%endif + SECTION .text %macro filter_sse2_h_fn 1 @@ -678,3 +762,785 @@ fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16 %undef d16 %undef s32 %undef d32 + +%if HAVE_AVX512ICL_EXTERNAL && ARCH_X86_64 +%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb +%if %5 + vpermb 
m%2, m6, m%1 + vpermb m%3, m7, m%1 + vpermb m%4, m8, m%1 +%else +%if %2 < %4 ; reuse a previous value if possible + pshufb m%2, m%1, m6 +%endif + pshufb m%3, m%1, m7 + pshufb m%4, m%1, m8 +%endif + mova m%1, m5 + vpdpbusd m%1, m%2, m9 + mova m%2, m5 + vpdpbusd m%2, m%3, m9 + vpdpbusd m%1, m%3, m10 + vpdpbusd m%2, m%4, m10 + packusdw m%1, m%2 + psrlw m%1, 7 +%endmacro + +%macro SPEL_H_INIT 2 ; put/avg, w +cglobal vp9_%1_8tap_smooth_%2h_8, 4, 7, 0 + lea r6, [vp9_spel_filter_smooth-8] + jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2h_8 %+ SUFFIX).main +cglobal vp9_%1_8tap_sharp_%2h_8, 4, 7, 0 + lea r6, [vp9_spel_filter_sharp-8] + jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2h_8 %+ SUFFIX).main +cglobal vp9_%1_8tap_regular_%2h_8, 4, 7, 0, dst, ds, src, ss, h, mx + lea r6, [vp9_spel_filter_regular-8] +.main: + mov mxd, mxm + movifnidn hd, hm + sub srcq, 3 + vpbroadcastd m5, [pd_64] + vpbroadcastd m9, [r6+mxq*8+0] + vpbroadcastd m10, [r6+mxq*8+4] +%endmacro + +%macro SPEL_V_INIT 2 ; put/avg, w +cglobal vp9_%1_8tap_smooth_%2v_8, 4, 7, 0 + lea r5, [vp9_spel_filter_smooth-8] + jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2v_8 %+ SUFFIX).main +cglobal vp9_%1_8tap_sharp_%2v_8, 4, 7, 0 + lea r5, [vp9_spel_filter_sharp-8] + jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2v_8 %+ SUFFIX).main +cglobal vp9_%1_8tap_regular_%2v_8, 4, 7, 0, dst, ds, src, ss, h, mx, my + lea r5, [vp9_spel_filter_regular-8] +.main: + mov myd, mym + movifnidn hd, hm + lea myq, [r5+myq*8] + vpbroadcastd m7, [pw_256] + vpbroadcastw m8, [myq+0] + vpbroadcastw m9, [myq+2] + lea r5, [ssq*3] + vpbroadcastw m10, [myq+4] + sub srcq, r5 + vpbroadcastw m11, [myq+6] +%endmacro + +%macro SPEL_HV_INIT 2 ; put/avg, w +cglobal vp9_%1_8tap_smooth_%2hv_8, 4, 8, 0 + lea r6, [vp9_spel_filter_smooth-8] + jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2hv_8 %+ SUFFIX).main +cglobal vp9_%1_8tap_sharp_%2hv_8, 4, 8, 0 + lea r6, [vp9_spel_filter_sharp-8] + jmp mangle(private_prefix %+ 
_vp9_%1_8tap_regular_%2hv_8 %+ SUFFIX).main +cglobal vp9_%1_8tap_regular_%2hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my + lea r6, [vp9_spel_filter_regular-8] +.main: +%if %2 == 16 + xor r7d, r7d +.main2: +%endif + mov mxd, mxm + movifnidn hd, hm + sub srcq, 3 + vpbroadcastd m9, [r6+mxq*8+0] + vpbroadcastd m10, [r6+mxq*8+4] + mov mxd, mym + vpbroadcastd m5, [pd_64] + lea myq, [r6+mxq*8] + lea r5, [ssq*3] + sub srcq, r5 +%endmacro + +%macro MC_AVX512 1 ; put/avg + SPEL_H_INIT %1, 4 + vbroadcasti32x4 m6, [spel_h_shufA] + lea r5, [ssq*3] + vbroadcasti32x4 m7, [spel_h_shufB] + lea r6, [dsq*3] + vbroadcasti32x4 m8, [spel_h_shufC] +.h_w4_loop: + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 + vinserti32x4 m0, [srcq+ssq*2], 2 + vinserti32x4 m0, [srcq+r5 ], 3 + lea srcq, [srcq+ssq*4] + pshufb m1, m0, m6 + pshufb m0, m7 + mova m2, m5 + vpdpbusd m2, m1, m9 + vpdpbusd m2, m0, m10 + vpmovsdw ym0, m2 + psraw ym0, 7 + packuswb ym0, ym0 + vextracti32x4 xm1, ym0, 1 +%ifidn %1, avg + movd xmm2, [dstq+dsq*0] + pinsrd xmm2, [dstq+dsq*1], 1 + movd xmm3, [dstq+dsq*2] + pinsrd xmm3, [dstq+r6 ], 1 + pavgb xm0, xmm2 + pavgb xm1, xmm3 +%endif + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + movd [dstq+dsq*2], xm1 + pextrd [dstq+r6 ], xm1, 1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .h_w4_loop + RET + + SPEL_H_INIT %1, 8 + vbroadcasti32x4 m6, [spel_h_shufA] + lea r5, [ssq*3] + vbroadcasti32x4 m7, [spel_h_shufB] + lea r6, [dsq*3] + vbroadcasti32x4 m8, [spel_h_shufC] +.h_w8_loop: + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 + vinserti32x4 m0, [srcq+ssq*2], 2 + vinserti32x4 m0, [srcq+r5 ], 3 + lea srcq, [srcq+ssq*4] + PUT_8TAP_H 0, 1, 2, 3 + vpmovuswb ym0, m0 + vextracti32x4 xm1, ym0, 1 +%ifidn %1, avg + movq xmm2, [dstq+dsq*0] + movhps xmm2, [dstq+dsq*1] + movq xmm3, [dstq+dsq*2] + movhps xmm3, [dstq+r6 ] + pavgb xm0, xmm2 + pavgb xm1, xmm3 +%endif + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + movq [dstq+dsq*2], xm1 + movhps [dstq+r6 ], xm1 + lea 
dstq, [dstq+dsq*4] + sub hd, 4 + jg .h_w8_loop + RET + + SPEL_H_INIT %1, 16 + mova m6, [spel_h_perm16] + vpbroadcastd m8, [pb_4] + paddb m7, m8, m6 + paddb m8, m7 +.h_w16_loop: + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 1, 2, 3, 1 + vpmovuswb ym0, m0 +%ifidn %1, avg + movu xm1, [dstq+dsq*0] + vinserti32x4 ym1, [dstq+dsq*1], 1 + pavgb ym0, ym1 +%endif + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16_loop + RET + + SPEL_H_INIT %1, 32 + vbroadcasti32x4 m6, [spel_h_shufA] + vbroadcasti32x4 m7, [spel_h_shufB] + vbroadcasti32x4 m8, [spel_h_shufC] +.h_w32_loop: + movu ym0, [srcq+ssq*0+8*0] + vinserti32x8 m0, [srcq+ssq*1+8*0], 1 + movu ym1, [srcq+ssq*0+8*1] + vinserti32x8 m1, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 +%ifidn %1, avg + movu ym1, [dstq+dsq*0] + vinserti32x8 m1, [dstq+dsq*1], 1 + pavgb m0, m1 +%endif + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w32_loop + RET + + SPEL_H_INIT %1, 64 + vbroadcasti32x4 m6, [spel_h_shufA] + vbroadcasti32x4 m7, [spel_h_shufB] + vbroadcasti32x4 m8, [spel_h_shufC] +.h_w64_loop: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + add srcq, ssq + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 +%ifidn %1, avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w64_loop + RET + + SPEL_V_INIT %1, 4 + movd xmm2, [srcq+ssq*0] + pinsrd xmm2, [srcq+ssq*1], 1 + pinsrd xmm2, [srcq+ssq*2], 2 + add srcq, r5 + pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 + movd xmm3, [srcq+ssq*1] + vpbroadcastd xmm1, [srcq+ssq*2] + add srcq, r5 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 + vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 + palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm3, xmm1 ; 45 56 + punpcklbw xmm1, xmm2, xmm4 ; 01 12 + punpckhbw xmm2, xmm4 ; 
23 34 +%if WIN64 + movaps [rsp+8], xmm6 +%endif +.v_w4_loop: + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xmm5, xmm1, xm8 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm6, xmm2, xm9 ; a1 b1 + mova xmm2, xmm3 + pmaddubsw xmm3, xm10 ; a2 b2 + paddw xmm5, xmm3 + vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm0, 0x02 ; 7 8 + punpcklbw xmm3, xmm4 ; 67 78 + pmaddubsw xmm4, xmm3, xm11 ; a3 b3 + paddw xmm6, xmm4 + paddsw xmm5, xmm6 + pmulhrsw xmm5, xm7 + packuswb xmm5, xmm5 +%ifidn %1, avg + movd xmm4, [dstq+dsq*0] + pinsrd xmm4, [dstq+dsq*1], 1 + pavgb xmm5, xmm4 +%endif + movd [dstq+dsq*0], xmm5 + pextrd [dstq+dsq*1], xmm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop +%if WIN64 + movaps xmm6, [rsp+8] +%endif + RET + + SPEL_V_INIT %1, 8 + movq xmm1, [srcq+ssq*0] + vpbroadcastq ymm0, [srcq+ssq*1] + vpbroadcastq ymm2, [srcq+ssq*2] + add srcq, r5 + vpbroadcastq ymm5, [srcq+ssq*0] + vpbroadcastq ymm3, [srcq+ssq*1] + vpbroadcastq ymm4, [srcq+ssq*2] + add srcq, r5 + vpblendd ymm1, ymm0, 0x30 + vpblendd ymm0, ymm2, 0x30 + punpcklbw ymm1, ymm0 ; 01 12 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm2, ymm5, 0x30 + vpblendd ymm5, ymm3, 0x30 + punpcklbw ymm2, ymm5 ; 23 34 + vpblendd ymm3, ymm4, 0x30 + vpblendd ymm4, ymm0, 0x30 + punpcklbw ymm3, ymm4 ; 45 56 +%if WIN64 + movaps [rsp+8], xmm6 +%endif +.v_w8_loop: + vpbroadcastq ymm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw ymm5, ymm1, ym8 ; a0 b0 + mova ymm1, ymm2 + pmaddubsw ymm6, ymm2, ym9 ; a1 b1 + mova ymm2, ymm3 + pmaddubsw ymm3, ym10 ; a2 b2 + paddw ymm5, ymm3 + vpblendd ymm3, ymm0, ymm4, 0x30 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm4, ymm4, ymm0, 0x30 + punpcklbw ymm3, ymm4 ; 67 78 + pmaddubsw ymm4, ymm3, ym11 ; a3 b3 + paddw ymm6, ymm4 + paddsw ymm5, ymm6 + pmulhrsw ymm5, ym7 + vextracti128 xmm4, ymm5, 1 + packuswb xmm5, xmm4 +%ifidn %1, avg + movq xmm4, [dstq+dsq*0] + movhps xmm4, [dstq+dsq*1] + pavgb xmm5, xmm4 +%endif + movq 
[dstq+dsq*0], xmm5 + movhps [dstq+dsq*1], xmm5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop +%if WIN64 + movaps xmm6, [rsp+8] +%endif + vzeroupper + RET + + SPEL_V_INIT %1, 16 + mova m12, [spel_v_perm16] + vbroadcasti32x4 m1, [srcq+ssq*0] + vbroadcasti32x4 ym4, [srcq+ssq*1] + mov r6d, 0x0f + vbroadcasti32x4 m2, [srcq+ssq*2] + add srcq, r5 + vbroadcasti32x4 ym5, [srcq+ssq*0] + kmovb k1, r6d + vbroadcasti32x4 m3, [srcq+ssq*1] + vbroadcasti32x4 ym6, [srcq+ssq*2] + add srcq, r5 + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m1{k1}, m4, m2, 0xcc + vshufpd m2{k1}, m5, m3, 0xcc + vshufpd m3{k1}, m6, m0, 0xcc + vpermb m1, m12, m1 ; 01 12 + vpermb m2, m12, m2 ; 23 34 + vpermb m3, m12, m3 ; 45 56 +.v_w16_loop: + pmaddubsw m4, m1, m8 ; a0 b0 + mova m1, m2 + pmaddubsw m5, m2, m9 ; a1 b1 + mova m2, m3 + pmaddubsw m6, m3, m10 ; a2 b2 + mova m3, m0 + paddw m4, m6 + vbroadcasti32x4 ym6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m3{k1}, m6, m0, 0xcc + vpermb m3, m12, m3 ; 67 78 + pmaddubsw m6, m3, m11 ; a3 b3 + paddw m5, m6 + paddsw m4, m5 + pmulhrsw m4, m7 + vextracti32x8 ym5, m4, 1 + packuswb ym4, ym5 +%ifidn %1, avg + mova xm5, [dstq+dsq*0] + vinserti32x4 ym5, [dstq+dsq*1], 1 + pavgb ym4, ym5 +%endif + mova [dstq+dsq*0], xm4 + vextracti32x4 [dstq+dsq*1], ym4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET + + SPEL_V_INIT %1, 32 + mova m12, [spel_v_perm32] + pmovzxbq m14, [pb_02461357] + vpshrdw m13, m12, m12, 8 + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + vpermb m1, m12, m0 ; 01 + vinserti32x8 m0, [srcq+ssq*2], 0 + add srcq, r5 + vpermb m2, m13, m0 ; 12 + vinserti32x8 m0, [srcq+ssq*0], 1 + vpermb m3, m12, m0 ; 23 + vinserti32x8 m0, [srcq+ssq*1], 0 + vpermb m4, m13, m0 ; 34 + vinserti32x8 m0, [srcq+ssq*2], 1 + add srcq, r5 + vpermb m5, m12, m0 ; 45 + vinserti32x8 m0, [srcq+ssq*0], 0 + vpermb m6, m13, m0 ; 56 +.v_w32_loop: + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddubsw m15, m1, 
m8 + mova m1, m3 + pmaddubsw m16, m2, m8 + mova m2, m4 + pmaddubsw m17, m3, m9 + mova m3, m5 + pmaddubsw m18, m4, m9 + mova m4, m6 + pmaddubsw m19, m5, m10 + vpermb m5, m12, m0 ; 67 + vinserti32x8 m0, [srcq+ssq*0], 0 + pmaddubsw m20, m6, m10 + vpermb m6, m13, m0 ; 78 + paddw m15, m19 + pmaddubsw m19, m5, m11 + paddw m16, m20 + pmaddubsw m20, m6, m11 + paddw m17, m19 + paddw m18, m20 + paddsw m15, m17 + paddsw m16, m18 + pmulhrsw m15, m7 + pmulhrsw m16, m7 + packuswb m15, m16 + vpermq m15, m14, m15 +%ifidn %1, avg + mova ym16, [dstq+dsq*0] + vinserti32x8 m16, [dstq+dsq*1], 1 + pavgb m15, m16 +%endif + mova [dstq+dsq*0], ym15 + vextracti32x8 [dstq+dsq*1], m15, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET + + SPEL_V_INIT %1, 64 + movu m2, [srcq+ssq*0] + movu m4, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, r5 + movu m13, [srcq+ssq*0] + movu m15, [srcq+ssq*1] + movu m17, [srcq+ssq*2] + add srcq, r5 + movu m0, [srcq+ssq*0] + punpcklbw m1, m2, m4 ; 01l + punpckhbw m2, m4 ; 01h + punpcklbw m3, m4, m6 ; 12l + punpckhbw m4, m6 ; 12h + punpcklbw m5, m6, m13 ; 23l + punpckhbw m6, m13 ; 23h + punpcklbw m12, m13, m15 ; 34l + punpckhbw m13, m15 ; 34h + punpcklbw m14, m15, m17 ; 45l + punpckhbw m15, m17 ; 45h + punpcklbw m16, m17, m0 ; 56l + punpckhbw m17, m0 ; 56h +%if WIN64 + movaps [rsp+8], xmm6 +%endif +.v_w64_loop: + movu m22, [srcq+ssq*1] + pmaddubsw m1, m8 ; a0l + pmaddubsw m18, m14, m10 ; a2l + lea srcq, [srcq+ssq*2] + pmaddubsw m2, m8 ; a0h + pmaddubsw m19, m15, m10 ; a2h + paddw m18, m1 + mova m1, m5 + paddw m19, m2 + mova m2, m6 + pmaddubsw m20, m5, m9 ; a1l + mova m5, m14 + pmaddubsw m21, m6, m9 ; a1h + mova m6, m15 + punpcklbw m14, m0, m22 ; 67l + punpckhbw m15, m0, m22 ; 67h + pmaddubsw m0, m14, m11 ; a3l + paddw m20, m0 + pmaddubsw m0, m15, m11 ; a3h + paddw m21, m0 + movu m0, [srcq+ssq*0] + paddsw m18, m20 + paddsw m19, m21 + pmaddubsw m3, m8 ; b0l + pmaddubsw m20, m16, m10 ; b2l + pmaddubsw m4, m8 ; b0h + pmaddubsw m21, m17, m10 
; b2h + pmulhrsw m18, m7 + pmulhrsw m19, m7 + paddw m20, m3 + mova m3, m12 + paddw m21, m4 + mova m4, m13 + packuswb m18, m19 +%ifidn %1, avg + pavgb m18, [dstq+dsq*0] +%endif + mova [dstq+dsq*0], m18 + pmaddubsw m18, m12, m9 ; b1l + mova m12, m16 + punpcklbw m16, m22, m0 ; 78l + pmaddubsw m19, m13, m9 ; b1h + mova m13, m17 + punpckhbw m17, m22, m0 ; 78h + pmaddubsw m22, m16, m11 ; b3l + paddw m18, m22 + pmaddubsw m22, m17, m11 ; b3h + paddw m19, m22 + paddsw m18, m20 + paddsw m19, m21 + pmulhrsw m18, m7 + pmulhrsw m19, m7 + packuswb m18, m19 +%ifidn %1, avg + pavgb m18, [dstq+dsq*1] +%endif + mova [dstq+dsq*1], m18 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w64_loop +%if WIN64 + movaps xmm6, [rsp+8] +%endif + vzeroupper + RET + + SPEL_HV_INIT %1, 4 + vbroadcasti32x4 ym2, [srcq+ssq*0] + vinserti32x4 m2, [srcq+ssq*1], 2 + vbroadcasti32x4 m6, [spel_h_shufA] + vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 0 1 2 + add srcq, r5 + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 + vbroadcasti32x4 m7, [spel_h_shufB] + vinserti32x4 m0, [srcq+ssq*2], 2 + add srcq, r5 + vpbroadcastd m11, [myq+0] + vinserti32x4 m0, [srcq+ssq*0], 3 ; 3 4 5 6 + vpbroadcastd m12, [myq+4] + lea r6, [dsq*3] + mova m8, [spel_hv_perm4] + pshufb m4, m2, m6 + mova m1, m5 + vpdpbusd m1, m4, m9 + pshufb m4, m0, m6 + mova m3, m5 + vpdpbusd m3, m4, m9 + pshufb m2, m7 + pshufb m0, m7 + vpdpbusd m1, m2, m10 + vpdpbusd m3, m0, m10 + psrad m1, 7 + psrad m0, m3, 7 + packuswb m1, m0 ; _3 04 15 26 + vpermb m1, m8, m1 ; 0123 1234 2345 3456 +.hv_w4_loop: + movu xm4, [srcq+ssq*1] + vinserti32x4 ym4, [srcq+ssq*2], 1 + vinserti32x4 m4, [srcq+r5 ], 2 + lea srcq, [srcq+ssq*4] + vinserti32x4 m4, [srcq+ssq*0], 3 ; 7 8 9 a + mova m3, m5 + pshufb m2, m4, m6 + vpdpbusd m3, m2, m9 + mova m2, m5 + vpdpbusd m2, m1, m11 + pshufb m4, m7 + vpdpbusd m3, m4, m10 + psrad m3, 7 + packuswb m1, m0, m3 ; 37 48 59 6a + mova m0, m3 + vpermb m1, m8, m1 ; 4567 5678 6789 789a + vpdpbusd m2, m1, m12 + psrad m2, 7 + vpmovdw ym2, m2 + 
packuswb ym2, ym2 + vextracti32x4 xm3, ym2, 1 +%ifidn %1, avg + movd xmm4, [dstq+dsq*0] + pinsrd xmm4, [dstq+dsq*1], 1 + pavgb xm2, xmm4 + movd xmm4, [dstq+dsq*2] + pinsrd xmm4, [dstq+r6 ], 1 + pavgb xm3, xmm4 +%endif + movd [dstq+dsq*0], xm2 + pextrd [dstq+dsq*1], xm2, 1 + movd [dstq+dsq*2], xm3 + pextrd [dstq+r6 ], xm3, 1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .hv_w4_loop + RET + + SPEL_HV_INIT %1, 8 + vbroadcasti32x4 ym2, [srcq+ssq*0] + vinserti32x4 m2, [srcq+ssq*1], 2 + vbroadcasti32x4 m6, [spel_h_shufA] + vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 0 1 2 + add srcq, r5 + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 + vbroadcasti32x4 m7, [spel_h_shufB] + vinserti32x4 m0, [srcq+ssq*2], 2 + add srcq, r5 + vpbroadcastd m11, [myq+0] + vinserti32x4 m0, [srcq+ssq*0], 3 ; 3 4 5 6 + vpbroadcastd m12, [myq+4] + lea r6, [dsq*3] + vbroadcasti32x4 m8, [spel_h_shufC] + mova m13, [spel_hv_perm8] + vpaddd m14, m13, [pb_4] {1to16} + PUT_8TAP_H 2, 1, 3, 4 + PUT_8TAP_H 0, 1, 3, 4 + packuswb m2, m0 ; _3 04 15 26 + vpermb m1, m13, m2 ; 0123 1234 2345 3456 (abcd) + vpermb m2, m14, m2 ; 0123 1234 2345 3456 (efgh) +.hv_w8_loop: + movu xm18, [srcq+ssq*1] + vinserti128 ym18, [srcq+ssq*2], 1 + vinserti32x4 m18, [srcq+r5 ], 2 + lea srcq, [srcq+ssq*4] + vinserti32x4 m18, [srcq+ssq*0], 3 ; 7 8 9 a + PUT_8TAP_H 18, 4, 16, 17 + mova m16, m5 + vpdpbusd m16, m1, m11 + mova m17, m5 + vpdpbusd m17, m2, m11 + packuswb m2, m0, m18 ; 37 48 59 6a + mova m0, m18 + vpermb m1, m13, m2 ; 4567 5678 6789 789a (abcd) + vpermb m2, m14, m2 ; 4567 5678 6789 789a (efgh) + vpdpbusd m16, m1, m12 + vpdpbusd m17, m2, m12 + packusdw m16, m17 + psrlw m16, 7 + vpmovuswb ym16, m16 + vextracti128 xm17, ym16, 1 +%ifidn %1, avg + movq xm18, [dstq+dsq*0] + movhps xm18, [dstq+dsq*1] + pavgb xm16, xm18 + movq xm18, [dstq+dsq*2] + movhps xm18, [dstq+r6 ] + pavgb xm17, xm18 +%endif + movq [dstq+dsq*0], xm16 + movhps [dstq+dsq*1], xm16 + movq [dstq+dsq*2], xm17 + movhps [dstq+r6 ], xm17 + lea dstq, [dstq+dsq*4] + sub 
hd, 4 + jg .hv_w8_loop + vzeroupper + RET + +cglobal vp9_%1_8tap_smooth_32hv_8, 4, 8, 0 + lea r6, [vp9_spel_filter_smooth-8] + mov r7d, 256*1 + jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2 +cglobal vp9_%1_8tap_sharp_32hv_8, 4, 8, 0 + lea r6, [vp9_spel_filter_sharp-8] + mov r7d, 256*1 + jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2 +cglobal vp9_%1_8tap_regular_32hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my + lea r6, [vp9_spel_filter_regular-8] + mov r7d, 256*1 + jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2 +cglobal vp9_%1_8tap_smooth_64hv_8, 4, 8, 0 + lea r6, [vp9_spel_filter_smooth-8] + mov r7d, 256*3 + jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2 +cglobal vp9_%1_8tap_sharp_64hv_8, 4, 8, 0 + lea r6, [vp9_spel_filter_sharp-8] + mov r7d, 256*3 + jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2 +cglobal vp9_%1_8tap_regular_64hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my + lea r6, [vp9_spel_filter_regular-8] + mov r7d, 256*3 + jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2 + + SPEL_HV_INIT %1, 16 + vpbroadcastw m11, [myq+0] + mova m6, [spel_h_perm16] + vpbroadcastw m12, [myq+2] + vpbroadcastd m8, [pb_4] + vpbroadcastw m13, [myq+4] + vpbroadcastd m15, [pw_256] + vpbroadcastw m14, [myq+6] + mova m19, [spel_hv_perm16] + vpandd m20, m19, [pw_m33] {1to16} ; even indices & ~32 + paddb m7, m6, m8 + lea r6d, [hq+r7] + paddb m8, m7 +%if WIN64 + push r8 +%endif +.hv_w16_loop0: + movu ym16, [srcq+ssq*0] ; 0 + movu ym17, [srcq+ssq*1] + lea r7, [srcq+r5] + vinserti32x8 m17, [srcq+ssq*2], 1 ; 1 2 + movu ym18, [r7+ssq*0] + mov r8, dstq + vinserti32x8 m18, [r7+ssq*1], 1 ; 3 4 + movu ym0, [r7+ssq*2] + add r7, r5 + vinserti32x8 m0, [r7+ssq*0], 1 ; 5 6 +INIT_YMM avx512icl + PUT_8TAP_H 16, 1, 2, 3, 1 +INIT_ZMM avx512icl + PUT_8TAP_H 17, 1, 2, 3, 1 + PUT_8TAP_H 18, 1, 2, 3, 1 + PUT_8TAP_H 0, 1, 2, 3, 1 + packuswb m16, m17 + 
packuswb m17, m18 + packuswb m18, m0 + vpermb m1, m20, m16 ; 01 12 + vpermb m2, m19, m17 ; 23 34 + vpermb m3, m19, m18 ; 45 56 +.hv_w16_loop: + movu ym18, [r7+ssq*1] + lea r7, [r7+ssq*2] + vinserti32x8 m18, [r7+ssq*0], 1 + PUT_8TAP_H 18, 4, 16, 17, 1 + pmaddubsw m16, m1, m11 ; a0 b0 + mova m1, m2 + pmaddubsw m17, m2, m12 ; a1 b1 + mova m2, m3 + pmaddubsw m3, m13 ; a2 b2 + packuswb m4, m0, m18 + paddw m16, m3 + vpermb m3, m19, m4 ; 67 78 + mova m0, m18 + pmaddubsw m4, m3, m14 ; a3 b3 + paddw m17, m4 + paddsw m16, m17 + pmulhrsw m16, m15 + vextracti32x8 ym17, m16, 1 + packuswb ym16, ym17 +%ifidn %1, avg + mova xm17, [r8+dsq*0] + vinserti128 ym17, [r8+dsq*1], 1 + pavgb ym16, ym17 +%endif + mova [r8+dsq*0], xm16 + vextracti128 [r8+dsq*1], ym16, 1 + lea r8, [r8+dsq*2] + sub hd, 2 + jg .hv_w16_loop + add srcq, 16 + add dstq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w16_loop0 + vzeroupper +%if WIN64 + pop r8 +%endif + RET +%endmacro + +INIT_ZMM avx512icl +MC_AVX512 put +MC_AVX512 avg + +%endif \ No newline at end of file diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c index 2fc45cff8e..2a3374541f 100644 --- a/tests/checkasm/vp9dsp.c +++ b/tests/checkasm/vp9dsp.c @@ -558,9 +558,9 @@ static void check_loopfilter(void) static void check_mc(void) { - LOCAL_ALIGNED_32(uint8_t, buf, [72 * 72 * 2]); - LOCAL_ALIGNED_32(uint8_t, dst0, [64 * 64 * 2]); - LOCAL_ALIGNED_32(uint8_t, dst1, [64 * 64 * 2]); + LOCAL_ALIGNED_64(uint8_t, buf, [72 * 72 * 2]); + LOCAL_ALIGNED_64(uint8_t, dst0, [64 * 64 * 2]); + LOCAL_ALIGNED_64(uint8_t, dst1, [64 * 64 * 2]); VP9DSPContext dsp; int op, hsize, bit_depth, filter, dx, dy; declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride, -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-08-28 1:18 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox. Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=175634386154.25.12603703631889415666@463a07221176 \ --to=ffmpeg-devel@ffmpeg.org \ --cc=code@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link.
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git