* [FFmpeg-devel] [PATCH] vp9: Improve 8bpc AVX2 inverse transform asm (PR #20526)
@ 2025-09-15 13:25 Henrik Gramner via ffmpeg-devel
From: Henrik Gramner via ffmpeg-devel @ 2025-09-15 13:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Henrik Gramner
PR #20526 opened by Henrik Gramner (gramner)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20526
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20526.patch
This adds size-efficient AVX2 implementations of all the inverse transform types (replacing a few fully unrolled existing ones in the process), which reduces code size by ~27 kB despite covering more transform sizes.
The existing AVX implementations are also removed, as they serve very little purpose now; this saves another ~65 kB of code.
The time to compile `vp9itxfm.asm` (previously an extreme outlier) is cut to less than half, with the AVX2 code split out into a separate file.
Checkasm numbers on Zen 4:
```
                                              old      new
vp9_inv_adst_adst_4x4_sub4_add_8_ssse3:      29.1
vp9_inv_adst_adst_4x4_sub4_add_8_avx2:        N/A     22.5
vp9_inv_dct_dct_4x4_sub4_add_8_ssse3:        26.2
vp9_inv_dct_dct_4x4_sub4_add_8_avx2:          N/A     16.6
vp9_inv_adst_adst_8x8_sub8_add_8_ssse3:     105.2
vp9_inv_adst_adst_8x8_sub8_add_8_avx2:        N/A     62.3
vp9_inv_dct_dct_8x8_sub8_add_8_ssse3:        55.7
vp9_inv_dct_dct_8x8_sub8_add_8_avx2:          N/A     47.1
vp9_inv_adst_adst_16x16_sub16_add_8_ssse3:  526.9
vp9_inv_adst_adst_16x16_sub16_add_8_avx2:   261.5    225.8
vp9_inv_dct_dct_16x16_sub8_add_8_ssse3:     142.4
vp9_inv_dct_dct_16x16_sub8_add_8_avx2:      163.3     89.0
vp9_inv_dct_dct_16x16_sub16_add_8_ssse3:    305.6
vp9_inv_dct_dct_16x16_sub16_add_8_avx2:     163.3    163.2
vp9_inv_dct_dct_32x32_sub16_add_8_ssse3:    893.5
vp9_inv_dct_dct_32x32_sub16_add_8_avx2:     465.2    462.2
vp9_inv_dct_dct_32x32_sub32_add_8_ssse3:   1760.7
vp9_inv_dct_dct_32x32_sub32_add_8_avx2:     903.7    879.9
vp9_inv_wht_wht_4x4_sub4_add_8_mmx:          16.7
vp9_inv_wht_wht_4x4_sub4_add_8_avx2:          N/A     14.8
```
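For anyone wanting to reproduce these timings, they come from FFmpeg's checkasm tool; a minimal sketch of the usual invocation is below (the exact option spellings are an assumption and may vary between FFmpeg versions):

```sh
# Build the checkasm test binary and benchmark only the vp9dsp functions.
# The --test/--bench option names are assumed; see `tests/checkasm/checkasm --help`.
./configure
make -j"$(nproc)" checkasm
tests/checkasm/checkasm --test=vp9dsp --bench
```

The numbers in the table are per-call timings, so lower is better.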
From 41bf9b5cfcd97cbab5a4554101397b0b31bf0ee1 Mon Sep 17 00:00:00 2001
From: Henrik Gramner <gramner@twoorioles.com>
Date: Mon, 15 Sep 2025 14:11:57 +0200
Subject: [PATCH 1/2] vp9: Add 8bpc AVX2 asm for inverse transforms
---
libavcodec/x86/Makefile | 1 +
libavcodec/x86/vp9dsp_init.c | 15 +
libavcodec/x86/vp9itxfm.asm | 369 +------
libavcodec/x86/vp9itxfm_avx2.asm | 1640 ++++++++++++++++++++++++++++++
4 files changed, 1666 insertions(+), 359 deletions(-)
create mode 100644 libavcodec/x86/vp9itxfm_avx2.asm
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index ebd2bdb310..461753c2fe 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -186,6 +186,7 @@ X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
x86/vp9intrapred_16bpp.o \
x86/vp9itxfm.o \
+ x86/vp9itxfm_avx2.o \
x86/vp9itxfm_avx512.o \
x86/vp9itxfm_16bpp.o \
x86/vp9itxfm_16bpp_avx512.o \
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index bbabcf38c3..a1e47445a8 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -108,9 +108,11 @@ itxfm_func(idct, iadst, 4, sse2);
itxfm_func(iadst, idct, 4, sse2);
itxfm_func(iadst, iadst, 4, sse2);
itxfm_funcs(4, ssse3);
+itxfm_funcs(4, avx2);
itxfm_funcs(8, sse2);
itxfm_funcs(8, ssse3);
itxfm_funcs(8, avx);
+itxfm_funcs(8, avx2);
itxfm_funcs(16, sse2);
itxfm_funcs(16, ssse3);
itxfm_funcs(16, avx);
@@ -118,6 +120,7 @@ itxfm_func(idct, idct, 32, sse2);
itxfm_func(idct, idct, 32, ssse3);
itxfm_func(idct, idct, 32, avx);
itxfm_func(iwht, iwht, 4, mmx);
+itxfm_func(iwht, iwht, 4, avx2);
itxfm_funcs(16, avx2);
itxfm_funcs(16, avx512icl);
itxfm_func(idct, idct, 32, avx2);
@@ -392,6 +395,18 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
init_fpel_func(0, 1, 64, avg, _8, avx2);
if (ARCH_X86_64) {
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+ dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
+ dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
+ dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
+ dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_avx2;
+ dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_avx2;
+ dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_avx2;
+ dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_avx2;
+ dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_avx2;
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx2;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx2;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx2;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx2;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2;
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index 2f290f2f88..a78ea56c22 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -24,36 +24,36 @@
%include "libavutil/x86/x86util.asm"
%include "vp9itxfm_template.asm"
-SECTION_RODATA 32
+SECTION_RODATA 16
%macro VP9_IDCT_COEFFS 2-3 0
const pw_m%1_%2
-times 8 dw -%1, %2
+times 4 dw -%1, %2
const pw_%2_%1
-times 8 dw %2, %1
+times 4 dw %2, %1
%if %3 == 1
const pw_m%2_m%1
-times 8 dw -%2, -%1
+times 4 dw -%2, -%1
%if %1 != %2
const pw_m%2_%1
-times 8 dw -%2, %1
+times 4 dw -%2, %1
const pw_%1_%2
-times 8 dw %1, %2
+times 4 dw %1, %2
%endif
%endif
%if %1 < 11585
-pw_m%1x2: times 16 dw -%1*2
+pw_m%1x2: times 8 dw -%1*2
%elif %1 > 11585
-pw_%1x2: times 16 dw %1*2
+pw_%1x2: times 8 dw %1*2
%else
const pw_%1x2
-times 16 dw %1*2
+times 8 dw %1*2
%endif
%if %2 != %1
-pw_%2x2: times 16 dw %2*2
+pw_%2x2: times 8 dw %2*2
%endif
%endmacro
@@ -1534,83 +1534,6 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
vextracti128 [dstq+stride3q], m%4, 1
%endmacro
-%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob
- cmp eobd, 1 ; faster path for when only DC is set
- jg .idctfull
-
- ; dc-only
- mova m1, [pw_11585x2]
- vpbroadcastw m0, [blockq]
- pmulhrsw m0, m1
- pmulhrsw m0, m1
- pxor m5, m5
- pmulhrsw m0, [pw_512]
- movd [blockq], xm5
-
- DEFINE_ARGS dst, stride, stride3, cnt
- mov cntd, 4
- lea stride3q, [strideq*3]
-.loop_dc:
- VP9_STORE_YMM_DC_4X 0, 1, 2, 3, 4, 5
- lea dstq, [dstq+4*strideq]
- dec cntd
- jg .loop_dc
- RET
-
- DEFINE_ARGS dst, stride, block, eob
-.idctfull:
- mova m1, [blockq+ 32]
- mova m2, [blockq+ 64]
- mova m3, [blockq+ 96]
- mova m5, [blockq+160]
- mova m6, [blockq+192]
- mova m7, [blockq+224]
- mova m8, [blockq+256]
- mova m9, [blockq+288]
- mova m10, [blockq+320]
- mova m11, [blockq+352]
- mova m12, [blockq+384]
- mova m13, [blockq+416]
- mova m14, [blockq+448]
- mova m15, [blockq+480]
-
- VP9_IDCT16_YMM_1D
- TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
- [blockq+192], [blockq+128], 1
- mova [blockq+ 0], m0
- VP9_IDCT16_YMM_1D
-
- mova [blockq+224], m7
-
- ; store
- VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- mova m6, [blockq+192]
- mova m7, [blockq+224]
- VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
-
- ; at the end of the loop, m0 should still be zero
- ; use that to zero out block coefficients
- pxor m0, m0
- ZERO_BLOCK blockq, 32, 16, m0
- RET
-%endif
-
;---------------------------------------------------------------------------------------------
; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------
@@ -2094,65 +2017,6 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
SWAP 5, 9, 15
%endmacro
-%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-%macro IADST16_YMM_FN 4
-INIT_YMM avx2
-cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob
- mova m1, [blockq+ 32]
- mova m2, [blockq+ 64]
- mova m3, [blockq+ 96]
- mova m5, [blockq+160]
- mova m6, [blockq+192]
- mova m7, [blockq+224]
- mova m8, [blockq+256]
- mova m9, [blockq+288]
- mova m10, [blockq+320]
- mova m11, [blockq+352]
- mova m12, [blockq+384]
- mova m13, [blockq+416]
- mova m14, [blockq+448]
- mova m15, [blockq+480]
-
- VP9_%2_YMM_1D
- TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
- [blockq+192], [blockq+128], 1
- mova [blockq+ 0], m0
- VP9_%4_YMM_1D
-
- mova [blockq+224], m7
-
- ; store
- VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- mova m6, [blockq+192]
- mova m7, [blockq+224]
- VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
- lea dstq, [dstq+2*strideq]
-
- ; at the end of the loop, m0 should still be zero
- ; use that to zero out block coefficients
- pxor m0, m0
- ZERO_BLOCK blockq, 32, 16, m0
- RET
-%endmacro
-
-IADST16_YMM_FN idct, IDCT16, iadst, IADST16
-IADST16_YMM_FN iadst, IADST16, idct, IDCT16
-IADST16_YMM_FN iadst, IADST16, iadst, IADST16
-%endif
-
;---------------------------------------------------------------------------------------------
; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------
@@ -2468,115 +2332,6 @@ IADST16_YMM_FN iadst, IADST16, iadst, IADST16
SUMSUB_BA w, 5, 13, 8
mova m10, [tmpq+28*%%str] ; t7
SUMSUB_BA w, 4, 10, 8
-%if cpuflag(avx2)
- ; the "shitty" about this idct is that the final pass does the outermost
- ; interleave sumsubs (t0/31, t1/30, etc) but the tN for the 16x16 need
- ; to be sequential, which means I need to load/store half of the sumsub
- ; intermediates back to/from memory to get a 16x16 transpose going...
- ; This would be easier if we had more (e.g. 32) YMM regs here.
- mova [tmpq+ 7*%%str], m9
- mova [tmpq+11*%%str], m12
- mova [tmpq+15*%%str], m11
- mova [tmpq+19*%%str], m2
- mova [tmpq+23*%%str], m3
- mova [tmpq+27*%%str], m13
- mova [tmpq+31*%%str], m10
- mova [tmpq+12*%%str], m5
-
- mova m13, [tmpq+30*%%str] ; t8
- mova m12, [tmpq+26*%%str] ; t9
- mova m11, [tmpq+22*%%str] ; t10
- mova m10, [tmpq+18*%%str] ; t11
- mova m9, [tmpq+17*%%str] ; t20
- mova m8, [tmpq+ 1*%%str] ; t21
- mova m3, [tmpq+25*%%str] ; t22
- mova m2, [tmpq+ 5*%%str] ; t23
-
- SUMSUB_BA w, 9, 10, 5
- SUMSUB_BA w, 8, 11, 5
- SUMSUB_BA w, 3, 12, 5
- SUMSUB_BA w, 2, 13, 5
- mova [tmpq+ 1*%%str], m10
- mova [tmpq+ 5*%%str], m11
- mova [tmpq+17*%%str], m12
- mova [tmpq+25*%%str], m13
-
- mova m13, [tmpq+14*%%str] ; t12
- mova m12, [tmpq+10*%%str] ; t13
- mova m11, [tmpq+ 9*%%str] ; t18
- mova m10, [tmpq+13*%%str] ; t19
-
- SUMSUB_BA w, 11, 12, 5
- SUMSUB_BA w, 10, 13, 5
- mova [tmpq+ 9*%%str], m13
- mova [tmpq+13*%%str], m12
- mova [tmpq+10*%%str], m10
- mova [tmpq+14*%%str], m11
-
- mova m13, [tmpq+ 6*%%str] ; t14
- mova m12, [tmpq+ 2*%%str] ; t15
- mova m11, [tmpq+21*%%str] ; t16
- mova m10, [tmpq+29*%%str] ; t17
- SUMSUB_BA w, 11, 12, 5
- SUMSUB_BA w, 10, 13, 5
- mova [tmpq+21*%%str], m12
- mova [tmpq+29*%%str], m13
- mova m12, [tmpq+10*%%str]
- mova m13, [tmpq+14*%%str]
-
- TRANSPOSE16x16W 6, 0, 15, 14, 1, 7, 5, 4, \
- 2, 3, 8, 9, 12, 13, 10, 11, \
- [tmpq+12*%%str], [tmpq+ 8*%%str], 1
- mova [tmpq+ 0*%%str], m6
- mova [tmpq+ 2*%%str], m0
- mova [tmpq+ 4*%%str], m15
- mova [tmpq+ 6*%%str], m14
- mova [tmpq+10*%%str], m7
- mova [tmpq+12*%%str], m5
- mova [tmpq+14*%%str], m4
- mova [tmpq+16*%%str], m2
- mova [tmpq+18*%%str], m3
- mova [tmpq+20*%%str], m8
- mova [tmpq+22*%%str], m9
- mova [tmpq+24*%%str], m12
- mova [tmpq+26*%%str], m13
- mova [tmpq+28*%%str], m10
- mova [tmpq+30*%%str], m11
-
- mova m0, [tmpq+21*%%str]
- mova m1, [tmpq+29*%%str]
- mova m2, [tmpq+13*%%str]
- mova m3, [tmpq+ 9*%%str]
- mova m4, [tmpq+ 1*%%str]
- mova m5, [tmpq+ 5*%%str]
- mova m7, [tmpq+25*%%str]
- mova m8, [tmpq+31*%%str]
- mova m9, [tmpq+27*%%str]
- mova m10, [tmpq+23*%%str]
- mova m11, [tmpq+19*%%str]
- mova m12, [tmpq+15*%%str]
- mova m13, [tmpq+11*%%str]
- mova m14, [tmpq+ 7*%%str]
- mova m15, [tmpq+ 3*%%str]
- TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, \
- 8, 9, 10, 11, 12, 13, 14, 15, \
- [tmpq+17*%%str], [tmpq+ 9*%%str], 1
- mova [tmpq+ 1*%%str], m0
- mova [tmpq+ 3*%%str], m1
- mova [tmpq+ 5*%%str], m2
- mova [tmpq+ 7*%%str], m3
- mova [tmpq+11*%%str], m5
- mova [tmpq+13*%%str], m6
- mova [tmpq+15*%%str], m7
- mova [tmpq+17*%%str], m8
- mova [tmpq+19*%%str], m9
- mova [tmpq+21*%%str], m10
- mova [tmpq+23*%%str], m11
- mova [tmpq+25*%%str], m12
- mova [tmpq+27*%%str], m13
- mova [tmpq+29*%%str], m14
- mova [tmpq+31*%%str], m15
-%else ; !avx2
TRANSPOSE8x8W 6, 0, 15, 14, 1, 7, 5, 4, 8
mova [tmpq+ 0*%%str], m6
mova [tmpq+ 4*%%str], m0
@@ -2645,7 +2400,6 @@ IADST16_YMM_FN iadst, IADST16, iadst, IADST16
mova [tmpq+22*%%str], m13
mova [tmpq+26*%%str], m14
mova [tmpq+30*%%str], m15
-%endif ; avx2
%else
mova m2, [tmpq+24*%%str] ; t6
mova m3, [tmpq+28*%%str] ; t7
@@ -3094,106 +2848,3 @@ cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride,
VP9_IDCT_IDCT_32x32_ADD_XMM sse2
VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
VP9_IDCT_IDCT_32x32_ADD_XMM avx
-
-; this is almost identical to VP9_STORE_2X, but it does two rows
-; for slightly improved interleaving, and it omits vpermq since the
-; input is DC so all values are identical
-%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
- mova m%2, [dstq]
- mova m%4, [dstq+strideq]
- punpckhbw m%3, m%2, m%6
- punpcklbw m%2, m%6
- punpckhbw m%5, m%4, m%6
- punpcklbw m%4, m%6
- paddw m%3, m%1
- paddw m%2, m%1
- paddw m%5, m%1
- paddw m%4, m%1
- packuswb m%2, m%3
- packuswb m%4, m%5
- mova [dstq+strideq*0], m%2
- mova [dstq+strideq*1], m%4
-%endmacro
-
-%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
- cmp eobd, 135
- jg .idctfull
- cmp eobd, 1
- jg .idct16x16
-
- ; dc-only case
- mova m1, [pw_11585x2]
- vpbroadcastw m0, [blockq]
- pmulhrsw m0, m1
- pmulhrsw m0, m1
- pxor m5, m5
- pmulhrsw m0, [pw_512]
- movd [blockq], xm5
-
- DEFINE_ARGS dst, stride, cnt
- mov cntd, 16
-.loop_dc:
- VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5
- lea dstq, [dstq+2*strideq]
- dec cntd
- jg .loop_dc
- RET
-
- DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
-.idct16x16:
- mov tmpq, rsp
- VP9_IDCT32_1D blockq, 1, 16
-
- mov stride30q, strideq ; stride
- lea stride2q, [strideq*2] ; stride*2
- shl stride30q, 5 ; stride*32
- mov cntd, 2
- sub stride30q, stride2q ; stride*30
-.loop2_16x16:
- mov dstq, dst_bakq
- lea dst_endq, [dstq+stride30q]
- VP9_IDCT32_1D tmpq, 2, 16
- add dst_bakq, 16
- add tmpq, 32
- dec cntd
- jg .loop2_16x16
-
- ; at the end of the loop, m1 should still be zero
- ; use that to zero out block coefficients
- ZERO_BLOCK blockq, 64, 16, m1
- RET
-
-.idctfull:
- mov cntd, 2
- mov tmpq, rsp
-.loop1_full:
- VP9_IDCT32_1D blockq, 1
- add blockq, 32
- add tmpq, 1024
- dec cntd
- jg .loop1_full
-
- sub blockq, 64
-
- mov stride30q, strideq ; stride
- lea stride2q, [strideq*2] ; stride*2
- shl stride30q, 5 ; stride*32
- mov cntd, 2
- mov tmpq, rsp
- sub stride30q, stride2q ; stride*30
-.loop2_full:
- mov dstq, dst_bakq
- lea dst_endq, [dstq+stride30q]
- VP9_IDCT32_1D tmpq, 2
- add dst_bakq, 16
- add tmpq, 32
- dec cntd
- jg .loop2_full
-
- ; at the end of the loop, m1 should still be zero
- ; use that to zero out block coefficients
- ZERO_BLOCK blockq, 64, 32, m1
- RET
-%endif
diff --git a/libavcodec/x86/vp9itxfm_avx2.asm b/libavcodec/x86/vp9itxfm_avx2.asm
new file mode 100644
index 0000000000..3d12aa9946
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_avx2.asm
@@ -0,0 +1,1640 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2025 Two Orioles, LLC
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+
+SECTION_RODATA 16
+
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+pw_512: times 2 dw 512 ; 16-byte aligned
+pd_8192: dd 8192
+pw_m512: times 2 dw -512
+pw_2048: times 2 dw 2048
+pw_1024: times 2 dw 1024 ; 16-byte aligned
+
+pw_804x2: times 2 dw 804*2
+pw_1606x2: times 2 dw 1606*2
+pw_3196x2: times 2 dw 3196*2
+pw_3981x2: times 2 dw 3981*2
+pw_6270x2: times 2 dw 6270*2
+pw_7005x2: times 2 dw 7005*2
+pw_7723x2: times 2 dw 7723*2
+pw_9760x2: times 2 dw 9760*2
+pw_11585x2: times 2 dw 11585*2
+pw_12140x2: times 2 dw 12140*2
+pw_12665x2: times 2 dw 12665*2
+pw_13160x2: times 2 dw 13160*2
+pw_13623x2: times 2 dw 13623*2
+pw_14053x2: times 2 dw 14053*2
+pw_14449x2: times 2 dw 14449*2
+pw_14811x2: times 2 dw 14811*2
+pw_15137x2: times 2 dw 15137*2
+pw_15426x2: times 2 dw 15426*2
+pw_15679x2: times 2 dw 15679*2
+pw_15893x2: times 2 dw 15893*2
+pw_16069x2: times 2 dw 16069*2
+pw_16207x2: times 2 dw 16207*2
+pw_16305x2: times 2 dw 16305*2
+pw_16364x2: times 2 dw 16364*2
+pw_m2404x2: times 2 dw -2404*2
+pw_m4756x2: times 2 dw -4756*2
+pw_m5520x2: times 2 dw -5520*2
+pw_m8423x2: times 2 dw -8423*2
+pw_m9102x2: times 2 dw -9102*2
+pw_m10394x2: times 2 dw -10394*2
+pw_m11003x2: times 2 dw -11003*2
+pw_m11585x2: times 2 dw -11585*2
+
+%macro COEF_PAIR 2-3
+pw_%1_%2: dw %1, %2
+pw_m%2_%1: dw -%2, %1
+%if %0 == 3
+pw_m%1_m%2: dw -%1, -%2
+%endif
+%endmacro
+
+COEF_PAIR 804, 16364
+COEF_PAIR 1606, 16305
+COEF_PAIR 3196, 16069, 1
+COEF_PAIR 3981, 15893
+COEF_PAIR 6270, 15137, 1
+COEF_PAIR 7005, 14811
+COEF_PAIR 7723, 14449
+COEF_PAIR 9102, 13623
+COEF_PAIR 9760, 13160
+COEF_PAIR 11585, 11585
+COEF_PAIR 12140, 11003
+COEF_PAIR 12665, 10394
+COEF_PAIR 13623, 9102, 1
+COEF_PAIR 14053, 8423
+COEF_PAIR 15137, 6270
+COEF_PAIR 15426, 5520
+COEF_PAIR 15679, 4756
+COEF_PAIR 16069, 3196
+COEF_PAIR 16207, 2404
+
+; ADST4-only:
+pw_0_13377: dw 0, 13377
+pw_13377_m13377: dw 13377, -13377
+pw_m13377_m5283: dw -13377, -5283
+pw_13377_m15212: dw 13377, -15212
+pw_9929_m5283: dw 9929, -5283
+pw_5283_15212: dw 5283, 15212
+pw_13377_9929: dw 13377, 9929
+
+; ADST16-only:
+pw_8423_3981: dw 8423, 3981
+pw_m8423_3981: dw -8423, 3981
+pw_14053_m15893: dw 14053, -15893
+pw_m14053_m15893: dw -14053, -15893
+pw_2404_9760: dw 2404, 9760
+pw_m2404_9760: dw -2404, 9760
+pw_16207_m13160: dw 16207, -13160
+pw_m16207_m13160: dw -16207, -13160
+pw_11003_804: dw 11003, 804
+pw_m11003_804: dw -11003, 804
+pw_12140_m16364: dw 12140, -16364
+pw_m12140_m16364: dw -12140, -16364
+pw_5520_7005: dw 5520, 7005
+pw_m5520_7005: dw -5520, 7005
+pw_15426_m14811: dw 15426, -14811
+pw_m15426_m14811: dw -15426, -14811
+
+SECTION .text
+
+%define o_base pw_512 + 128
+%define o(x) (r6 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%macro IWHT4_1D_PACKED 0
+ psubw m2, m0, m3
+ paddw m0, m3
+ punpckhqdq m2, m2 ; t2 t2
+ punpcklqdq m0, m0 ; t0 t0
+ psubw m1, m0, m2
+ psraw m1, 1
+ psubw m1, m3 ; t1 t3
+ psubw m0, m1 ; ____ out0
+ paddw m2, m1 ; out3 ____
+%endmacro
+
+INIT_XMM avx2
+cglobal vp9_iwht_iwht_4x4_add, 3, 3, 6, dst, stride, c
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+ lea r2, [dstq+strideq*2]
+ punpckhqdq m3, m0, m1 ; in1 in3
+ punpcklqdq m0, m1 ; in0 in2
+ movd m4, [r2 +strideq*0]
+ pinsrd m4, [dstq+strideq*1], 1
+ movd m5, [r2 +strideq*1]
+ pinsrd m5, [dstq+strideq*0], 1
+ psraw m3, 2
+ psraw m0, 2
+ IWHT4_1D_PACKED
+ punpckhwd m0, m1
+ punpcklwd m1, m2
+ punpckhdq m2, m0, m1
+ punpckldq m0, m1
+ punpckhqdq m3, m0, m2
+ punpcklqdq m0, m2
+ IWHT4_1D_PACKED
+ pmovzxbw m4, m4
+ pmovzxbw m5, m5
+ vpblendd m0, m2, 0x03
+ paddw m1, m4
+ paddw m0, m5
+ packuswb m0, m1
+ pextrd [dstq+strideq*0], m0, 1
+ pextrd [dstq+strideq*1], m0, 3
+ pextrd [r2 +strideq*0], m0, 2
+ movd [r2 +strideq*1], m0
+ RET
+
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], swap
+%if %7
+ vpbroadcastd m%2, [o(pw_%5_%6)]
+ vpbroadcastd m%3, [o(pw_m%6_%5)]
+%else
+ vpbroadcastd m%2, [o(pw_m%6_%5)]
+ vpbroadcastd m%3, [o(pw_%5_%6)]
+%endif
+ pmaddwd m%2, m%1
+ pmaddwd m%1, m%3
+ paddd m%2, m%4
+ paddd m%1, m%4
+ psrad m%2, 14
+ psrad m%1, 14
+ packssdw m%1, m%2
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 14
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 14
+%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
+ punpckhwd m%3, m%2, m%1
+ punpcklwd m%2, m%1
+%if %7 < 32
+ pmaddwd m%1, m%7, m%2
+ pmaddwd m%4, m%7, m%3
+%else
+ vpbroadcastd m%1, [o(pw_m%7_%6)]
+ pmaddwd m%4, m%3, m%1
+ pmaddwd m%1, m%2
+%endif
+ paddd m%4, m%5
+ paddd m%1, m%5
+ psrad m%4, 14
+ psrad m%1, 14
+ packssdw m%1, m%4
+%if %7 < 32
+ pmaddwd m%3, m%6
+ pmaddwd m%2, m%6
+%else
+ vpbroadcastd m%4, [o(pw_%6_%7)]
+ pmaddwd m%3, m%4
+ pmaddwd m%2, m%4
+%endif
+ paddd m%3, m%5
+ paddd m%2, m%5
+ psrad m%3, 14
+ psrad m%2, 14
+ packssdw m%2, m%3
+%endmacro
+
+%macro ADST_MULSUB_2W 8-10 ; dst/src[1-2], dst[3-4], tmp, rnd, coef[1-4]
+ vpbroadcastd m%3, [o(pw_m%8_%7)]
+ vpbroadcastd m%4, [o(pw_%7_%8)]
+ pmaddwd m%3, m%1
+ pmaddwd m%1, m%4
+%if %0 == 8
+ vpbroadcastd m%5, [o(pw_%8_%7)]
+ vpbroadcastd m%4, [o(pw_m%7_%8)]
+%else
+ vpbroadcastd m%5, [o(pw_m%10_%9)]
+ vpbroadcastd m%4, [o(pw_%9_%10)]
+%endif
+ pmaddwd m%5, m%2
+ pmaddwd m%4, m%2
+ paddd m%3, m%6
+ paddd m%1, m%6
+ psubd m%2, m%1, m%4
+ paddd m%1, m%4
+ psubd m%4, m%3, m%5
+ paddd m%3, m%5
+%endmacro
+
+%macro ADST_MULSUB_4W 12-14 ; dst/src[1-4], tmp[1-5], rnd, coef[1-4]
+ vpbroadcastd m%8, [o(pw_%11_%12)]
+ vpbroadcastd m%7, [o(pw_m%12_%11)]
+ punpckhwd m%5, m%2, m%1
+ punpcklwd m%2, m%1
+%if %0 == 12
+ vpbroadcastd m%1, [o(pw_m%11_%12)]
+ vpbroadcastd m%9, [o(pw_%12_%11)]
+%else
+ vpbroadcastd m%1, [o(pw_%13_%14)]
+ vpbroadcastd m%9, [o(pw_m%14_%13)]
+%endif
+ pmaddwd m%6, m%5, m%8
+ pmaddwd m%8, m%2
+ pmaddwd m%5, m%7
+ pmaddwd m%2, m%7
+ punpckhwd m%7, m%4, m%3
+ punpcklwd m%4, m%3
+ pmaddwd m%3, m%7, m%1
+ pmaddwd m%1, m%4
+ pmaddwd m%7, m%9
+ pmaddwd m%9, m%4
+ REPX {paddd x, m%10}, m%6, m%8, m%5, m%2
+ psubd m%4, m%6, m%3
+ paddd m%6, m%3
+ psubd m%3, m%8, m%1
+ paddd m%1, m%8
+ REPX {psrad x, 14}, m%4, m%3, m%6, m%1
+ psubd m%8, m%5, m%7
+ paddd m%5, m%7
+ packssdw m%3, m%4
+ psubd m%4, m%2, m%9
+ paddd m%2, m%9
+ packssdw m%1, m%6
+ REPX {psrad x, 14}, m%8, m%4, m%5, m%2
+ packssdw m%4, m%8
+ packssdw m%2, m%5
+%endmacro
+
+%macro INV_TXFM_FN 3-4 0 ; type1, type2, size, eob_offset
+cglobal vp9_i%1_i%2_%3_add, 4, 5, 0, dst, stride, c, eob, tx2
+ %undef cmp
+ %define %%p1 m(vp9_i%1_%3_internal)
+ lea r6, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(vp9_i%2_%3_internal).pass2]
+%ifidn %1_%2, dct_dct
+ cmp eobd, 1
+ jne %%p1
+%else
+%if %4
+ add eobd, %4
+%endif
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4
+%ifidn %1_%2, dct_dct
+ vpbroadcastw m0, [cq]
+ vpbroadcastd m1, [o(pw_11585x2)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ mova m1, m0
+ jmp m(vp9_idct_4x4_internal).pass2_end
+%endif
+%endmacro
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+
+cglobal vp9_idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call .main
+.pass1_end:
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ call .main
+.pass2_end:
+ vpbroadcastd m2, [o(pw_2048)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ lea r3, [dstq+strideq*2]
+ movd m2, [dstq+strideq*0]
+ pinsrd m2, [dstq+strideq*1], 1
+ movd m3, [r3 +strideq*1]
+ pinsrd m3, [r3 +strideq*0], 1
+ pxor m4, m4
+ pmovzxbw m2, m2
+ mova [cq+16*0], m4
+ pmovzxbw m3, m3
+ mova [cq+16*1], m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pextrd [dstq+strideq*1], m0, 1
+ pextrd [r3 +strideq*0], m0, 3
+ pextrd [r3 +strideq*1], m0, 2
+ RET
+ALIGN function_align
+.main:
+ vpbroadcastd m4, [o(pd_8192)]
+ punpckhwd m2, m1, m0
+ psubw m3, m0, m1
+ paddw m0, m1
+ punpcklqdq m0, m3
+ ITX_MUL2X_PACK 2, 1, 3, 4, 6270, 15137
+ vpbroadcastd m4, [o(pw_11585x2)]
+ pmulhrsw m0, m4 ; t0 t1
+ psubw m1, m0, m2 ; out3 out2
+ paddw m0, m2 ; out0 out1
+ ret
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+
+cglobal vp9_iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call .main
+ jmp m(vp9_idct_4x4_internal).pass1_end
+.pass2:
+ call .main
+ jmp m(vp9_idct_4x4_internal).pass2_end
+ALIGN function_align
+.main:
+ vpbroadcastd m4, [o(pw_0_13377)]
+ punpckhwd m2, m0, m1
+ vpbroadcastd m5, [o(pw_13377_m13377)]
+ punpcklwd m0, m1
+ vpbroadcastd m1, [o(pw_m13377_m5283)]
+ vpbroadcastd m3, [o(pw_13377_m15212)]
+ pmaddwd m4, m2
+ pmaddwd m5, m0
+ pmaddwd m1, m2
+ pmaddwd m3, m2
+ paddd m4, m5 ; 2
+ vpbroadcastd m5, [o(pw_9929_m5283)]
+ pmaddwd m5, m0
+ paddd m1, m5
+ paddd m3, m5 ; 1
+ vpbroadcastd m5, [o(pw_5283_15212)]
+ pmaddwd m0, m5
+ vpbroadcastd m5, [o(pw_13377_9929)]
+ pmaddwd m2, m5
+ vpbroadcastd m5, [o(pd_8192)]
+ paddd m4, m5
+ paddd m0, m5
+ paddd m3, m5
+ paddd m1, m0 ; 3
+ paddd m0, m2 ; 0
+ REPX {psrad x, 14}, m4, m1, m3, m0
+ packssdw m1, m4 ; out3 out2
+ packssdw m0, m3 ; out0 out1
+ ret
+
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
+ movq xm%3, [dstq ]
+ movhps xm%3, [dstq+%5]
+ movq xm%4, [dstq+%6]
+ movhps xm%4, [dstq+%7]
+ pmovzxbw m%3, xm%3
+ pmovzxbw m%4, xm%4
+ paddw m%3, m%1
+ paddw m%4, m%2
+ packuswb m%3, m%4
+ vextracti128 xm%4, m%3, 1
+ movq [dstq ], xm%3
+ movhps [dstq+%6], xm%3
+ movq [dstq+%5], xm%4
+ movhps [dstq+%7], xm%4
+%endmacro
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8
+%ifidn %1_%2, dct_dct
+ vpbroadcastw xm2, [cq]
+ vpbroadcastd xm1, [o(pw_11585x2)]
+ vpbroadcastd xm0, [o(pw_1024)]
+ mov word [cq], 0
+ pmulhrsw xm2, xm1
+ add r3d, 3
+ pmulhrsw xm2, xm1
+ pmulhrsw xm2, xm0
+.dconly_loop:
+ pmovzxbw xm0, [dstq+strideq*0]
+ pmovzxbw xm1, [dstq+strideq*1]
+ paddw xm0, xm2
+ paddw xm1, xm2
+ packuswb xm0, xm1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jge .dconly_loop
+ RET
+%endif
+%endmacro
+
+INIT_YMM avx2
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+
+cglobal vp9_idct_8x8_internal, 0, 5, 8, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ call .main
+ shufps m4, m0, m1, q0220
+ shufps m5, m0, m1, q1331
+ vbroadcasti128 m0, [o(deint_shuf)]
+ shufps m1, m2, m3, q0220
+ shufps m3, m2, m3, q1331
+ REPX {pshufb x, m0}, m4, m5, m1, m3
+ vinserti128 m0, m4, xm1, 1
+ vperm2i128 m2, m4, m1, 0x31
+ vinserti128 m1, m5, xm3, 1
+ vperm2i128 m3, m5, m3, 0x31
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m4, [o(pw_1024)]
+ vpermq m1, m1, q2031
+ vpermq m3, m3, q2031
+.end:
+ vpermq m0, m0, q3120
+ vpermq m2, m2, q3120
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 4, 5
+ RET
+ALIGN function_align
+.main:
+ vpbroadcastd m6, [o(pd_8192)]
+ punpckhwd m5, m3, m0 ; in7 in1
+ punpckhwd m4, m1, m2 ; in3 in5
+ punpcklwd m3, m1 ; in2 in6
+ psubw m1, m0, m2
+ paddw m0, m2
+ punpcklqdq m0, m1 ; in0+in4 in0-in4
+ ITX_MUL2X_PACK 5, 1, 2, 6, 3196, 16069, 1 ; t4a t7a
+ ITX_MUL2X_PACK 4, 1, 2, 6, 13623, 9102, 1 ; t5a t6a
+ ITX_MUL2X_PACK 3, 1, 2, 6, 6270, 15137 ; t3 t2
+ vpbroadcastd m6, [o(pw_11585x2)]
+ psubw m2, m5, m4 ; t4 t7
+ paddw m5, m4 ; t5a t6a
+ pshufd m4, m2, q1032
+ psubw m1, m2, m4
+ paddw m4, m2
+ vpblendd m4, m1, 0xcc
+ pmulhrsw m0, m6 ; t0 t1
+ pmulhrsw m4, m6 ; t6 t5
+ psubw m1, m0, m3 ; tmp3 tmp2
+ paddw m0, m3 ; tmp0 tmp1
+ shufps m2, m5, m4, q1032
+ vpblendd m5, m4, 0xcc
+ psubw m3, m0, m2 ; out7 out6
+ paddw m0, m2 ; out0 out1
+ psubw m2, m1, m5 ; out4 out5
+ paddw m1, m5 ; out3 out2
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+
+cglobal vp9_iadst_8x8_internal, 0, 5, 8, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call .main
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ pxor m3, m3
+ psubw m0, m3, m0
+ psubw m2, m3, m2
+ punpcklwd m3, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ vperm2i128 m2, m3, m0, 0x31
+ vinserti128 m0, m3, xm0, 1
+ vperm2i128 m3, m4, m1, 0x31
+ vinserti128 m1, m4, xm1, 1
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call .main
+ vpbroadcastd m5, [o(pw_1024)]
+ vpbroadcastd xm4, [o(pw_2048)]
+ psubw m4, m5 ; lower half = 1024, upper half = -1024
+ REPX {vpermq x, x, q3120}, m1, m3
+ jmp m(vp9_idct_8x8_internal).end
+ALIGN function_align
+.main:
+ vpbroadcastd m7, [o(pd_8192)]
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+ ADST_MULSUB_2W 0, 2, 4, 5, 6, 7, 1606, 16305, 12665, 10394 ; t0, t4, t1, t5
+ pslld m2, 2
+ REPX {psrad x, 14}, m0, m5, m4
+ pblendw m2, m5, 0x55 ; t5 t4
+ packssdw m0, m4 ; t0 t1
+ ADST_MULSUB_2W 1, 3, 4, 5, 6, 7, 7723, 14449, 15679, 4756 ; t2, t6, t3, t7
+ pslld m5, 2
+ REPX {psrad x, 14}, m3, m1, m4
+ pblendw m3, m5, 0xaa ; t6 t7
+ packssdw m1, m4 ; t2 t3
+ ADST_MULSUB_2W 2, 3, 4, 5, 6, 7, 6270, 15137 ; t4, t6, t5, t7
+ REPX {psrad x, 14}, m3, m2, m5, m4
+ packssdw m3, m5 ; t6 t7
+ packssdw m2, m4 ; -out1 out6
+ vpbroadcastd m5, [o(pw_11585x2)]
+ psubw m4, m0, m1 ; t2 t3
+ paddw m0, m1 ; out0 -out7
+ punpckhqdq m1, m4, m3 ; t3 t7
+ punpcklqdq m4, m3 ; t2 t6
+ punpckhqdq m3, m2, m0 ; out6 -out7
+ punpcklqdq m0, m2 ; out0 -out1
+ psubw m2, m4, m1
+ paddw m1, m4
+ pshufd m1, m1, q1032
+ pmulhrsw m2, m5 ; out4 -out5
+ pmulhrsw m1, m5 ; out2 -out3
+ ret
+
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+ pmovzxbw m%3, [dstq+%5]
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+ pmovzxbw m%4, [dstq+%6]
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vpermq m%3, m%3, q3120
+ mova [dstq+%5], xm%3
+ vextracti128 [dstq+%6], m%3, 1
+%endmacro
+
+%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, 16x16, %3
+%ifidn %1_%2, dct_dct
+ movd xm0, [o(pw_11585x2)]
+ pmulhrsw xm3, xm0, [cq]
+ pxor m2, m2
+ pmulhrsw xm3, xm0
+ pmulhrsw xm3, [o(pw_512)]
+ movd [cq], xm2
+ add r3d, 7
+ vpbroadcastw m3, xm3
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+%macro IDCT16_MAIN 2 ; name, idct32 (uses 32-bit intermediates for 11585 multiplies)
+%1_fast:
+ vpbroadcastd m10, [o(pw_m9102x2)]
+ vpbroadcastd m8, [o(pw_13623x2)]
+ vpbroadcastd m14, [o(pw_16069x2)]
+ vpbroadcastd m15, [o(pw_3196x2)]
+ pmulhrsw m10, m6
+ vpbroadcastd m12, [o(pw_15137x2)]
+ pmulhrsw m6, m8
+ vpbroadcastd m8, [o(pw_6270x2)]
+ pmulhrsw m14, m2
+ vpbroadcastd m9, [o(pw_11585x2)]
+ pmulhrsw m2, m15
+ vpbroadcastd m15, [o(pd_8192)]
+ pmulhrsw m12, m4
+ pmulhrsw m4, m8
+ pmulhrsw m0, m9
+ mova m8, m0
+ jmp %%main2
+ALIGN function_align
+%1:
+ mova [rsp+gprsize+32*1], m13
+ mova [rsp+gprsize+32*2], m9
+ vpbroadcastd m15, [o(pd_8192)]
+ ITX_MULSUB_2W 10, 6, 9, 13, 15, 13623, 9102 ; t5a, t6a
+ ITX_MULSUB_2W 2, 14, 9, 13, 15, 3196, 16069 ; t4a, t7a
+ ITX_MULSUB_2W 4, 12, 9, 13, 15, 6270, 15137 ; t2, t3
+ ITX_MULSUB_2W 0, 8, 9, 13, 15, 11585, 11585 ; t1, t0
+%%main2:
+ paddw m13, m14, m6 ; t7
+ psubw m14, m6 ; t6a
+ paddw m6, m2, m10 ; t4
+ psubw m2, m10 ; t5a
+%if %2
+ ITX_MULSUB_2W 14, 2, 9, 10, 15, 11585, 11585 ; t5, t6
+ psubw m10, m0, m4 ; t2
+ paddw m4, m0 ; t1
+ paddw m0, m8, m12 ; t0
+ psubw m8, m12 ; t3
+ psubw m9, m4, m2 ; t6
+ paddw m2, m4 ; t1
+ psubw m4, m8, m6 ; t4
+ paddw m8, m6 ; t3
+ psubw m6, m10, m14 ; t5
+ paddw m10, m14 ; t2
+%else
+ vpbroadcastd m9, [o(pw_11585x2)]
+ psubw m10, m14, m2
+ paddw m2, m14
+ pmulhrsw m10, m9 ; t5
+ pmulhrsw m2, m9 ; t6
+ psubw m14, m0, m4 ; t2
+ paddw m4, m0 ; t1
+ paddw m0, m8, m12 ; t0
+ psubw m8, m12 ; t3
+ psubw m9, m4, m2 ; t6
+ paddw m2, m4 ; t1
+ psubw m4, m8, m6 ; t4
+ paddw m8, m6 ; t3
+ psubw m6, m14, m10 ; t5
+ paddw m10, m14 ; t2
+%endif
+ psubw m14, m0, m13 ; t7
+ paddw m0, m13 ; t0
+ test eobd, eobd
+ jl %%main3_fast
+ mova m12, [rsp+gprsize+32*2] ; in9
+ mova [rsp+gprsize+32*2], m14
+ mova m13, [rsp+gprsize+32*1] ; in13
+ mova [rsp+gprsize+32*1], m2
+ mova m14, [rsp+gprsize+32*0] ; in15
+ mova [rsp+gprsize+32*0], m8
+ ITX_MULSUB_2W 1, 14, 2, 8, 15, 1606, 16305 ; t8a, t15a
+ ITX_MULSUB_2W 12, 7, 2, 8, 15, 12665, 10394 ; t9a, t14a
+ ITX_MULSUB_2W 5, 11, 2, 8, 15, 7723, 14449 ; t10a, t13a
+ ITX_MULSUB_2W 13, 3, 2, 8, 15, 15679, 4756 ; t11a, t12a
+ jmp %%main3
+%%main3_fast:
+ mova [rsp+gprsize+32*2], m14
+ mova [rsp+gprsize+32*1], m2
+ mova [rsp+gprsize+32*0], m8
+ vpbroadcastd m14, [o(pw_16305x2)]
+ vpbroadcastd m2, [o(pw_1606x2)]
+ vpbroadcastd m12, [o(pw_m10394x2)]
+ vpbroadcastd m8, [o(pw_12665x2)]
+ pmulhrsw m14, m1
+ vpbroadcastd m11, [o(pw_14449x2)]
+ pmulhrsw m1, m2
+ vpbroadcastd m2, [o(pw_7723x2)]
+ pmulhrsw m12, m7
+ vpbroadcastd m13, [o(pw_m4756x2)]
+ pmulhrsw m7, m8
+ vpbroadcastd m8, [o(pw_15679x2)]
+ pmulhrsw m11, m5
+ pmulhrsw m5, m2
+ pmulhrsw m13, m3
+ pmulhrsw m3, m8
+%%main3:
+ paddw m2, m11, m3 ; t12
+ psubw m3, m11 ; t13
+ psubw m11, m14, m7 ; t14
+ paddw m14, m7 ; t15
+ psubw m7, m13, m5 ; t10
+ paddw m5, m13 ; t11
+ psubw m13, m1, m12 ; t9
+ paddw m12, m1 ; t8
+ ITX_MULSUB_2W 11, 13, 1, 8, 15, 6270, 15137 ; t9a, t14a
+ ITX_MULSUB_2W 3, 7, 1, 8, 15, m15137, 6270 ; t10a, t13a
+%if %2
+ psubw m1, m12, m5 ; t11a
+ paddw m12, m5 ; t8a
+ psubw m5, m13, m7 ; t13
+ paddw m13, m7 ; t14
+ psubw m7, m14, m2 ; t12a
+ paddw m14, m2 ; t15a
+ psubw m2, m11, m3 ; t10
+ paddw m3, m11 ; t9
+ ITX_MULSUB_2W 5, 2, 8, 11, 15, 11585, 11585 ; t10a, t13a
+ ITX_MULSUB_2W 7, 1, 8, 11, 15, 11585, 11585 ; t11, t12
+%else
+ vpbroadcastd m15, [o(pw_11585x2)]
+ psubw m8, m12, m5 ; t11a
+ paddw m12, m5 ; t8a
+ psubw m5, m13, m7 ; t13
+ paddw m13, m7 ; t14
+ psubw m7, m14, m2 ; t12a
+ paddw m14, m2 ; t15a
+ psubw m1, m11, m3 ; t10
+ paddw m3, m11 ; t9
+ paddw m2, m5, m1 ; t13a
+ psubw m5, m1 ; t10a
+ paddw m1, m7, m8 ; t12
+ psubw m7, m8 ; t11
+ REPX {pmulhrsw x, m15}, m2, m5, m1, m7
+%endif
+ mova m8, [rsp+gprsize+32*1] ; t1
+ psubw m15, m0, m14 ; out15
+ paddw m0, m14 ; out0
+ psubw m14, m8, m13 ; out14
+ paddw m8, m13 ; out1
+ psubw m13, m10, m2 ; out13
+ paddw m2, m10 ; out2
+ psubw m11, m4, m7 ; out11
+ paddw m4, m7 ; out4
+ mova m7, [rsp+gprsize+32*2] ; t7
+ psubw m10, m6, m5 ; out10
+ paddw m5, m6 ; out5
+ paddw m6, m9, m3 ; out6
+ psubw m9, m3 ; out9
+ mova m3, [rsp+gprsize+32*0] ; t3
+ mova [rsp+gprsize+32*1], m8
+ psubw m8, m7, m12 ; out8
+ paddw m7, m12 ; out7
+ psubw m12, m3, m1 ; out12
+ paddw m3, m1 ; out3
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst, 39-23
+
+cglobal vp9_idct_16x16_internal, 0, 5, 16, 32*6, dst, stride, c, eob, tx2
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ mova m4, [cq+32*4]
+ mova m5, [cq+32*5]
+ mova m6, [cq+32*6]
+ mova m7, [cq+32*7]
+ sub eobd, 39
+ jl .pass1_fast
+ add cq, 32*12
+ mova m8, [cq-32*4]
+ mova m9, [cq-32*3]
+ mova m10, [cq-32*2]
+ mova m11, [cq-32*1]
+ mova m12, [cq+32*0]
+ mova m13, [cq+32*1]
+ mova m14, [cq+32*2]
+ mova m15, [cq+32*3]
+ mova [rsp], m15
+ call .main
+ vextracti128 [rsp+16*4], m0, 1
+ mova [rsp+16*0], xm0
+.pass1_end:
+ vextracti128 [rsp+16*5], m8, 1
+ mova [rsp+16*1], xm8
+ mova xm1, [rsp+32*1+16*0]
+ vinserti128 m8, m9, [rsp+32*1+16*1], 0
+ vinserti128 m1, xm9, 1
+ vperm2i128 m9, m2, m10, 0x31
+ vinserti128 m2, xm10, 1
+ vperm2i128 m10, m3, m11, 0x31
+ vinserti128 m3, xm11, 1
+ vperm2i128 m11, m4, m12, 0x31
+ vinserti128 m4, xm12, 1
+ vperm2i128 m12, m5, m13, 0x31
+ vinserti128 m5, xm13, 1
+ vperm2i128 m13, m6, m14, 0x31
+ vinserti128 m6, xm14, 1
+ vperm2i128 m14, m7, m15, 0x31
+ vinserti128 m7, xm15, 1
+ mova m15, [rsp+32*2]
+ pxor m0, m0
+ mov r3, -32*12
+.zero_loop:
+ mova [cq+r3+32*0], m0
+ mova [cq+r3+32*1], m0
+ mova [cq+r3+32*2], m0
+ mova [cq+r3+32*3], m0
+ add r3, 32*4
+ jle .zero_loop
+ punpcklwd m0, m9, m10
+ punpckhwd m9, m10
+ punpcklwd m10, m15, m8
+ punpckhwd m15, m8
+ punpckhwd m8, m11, m12
+ punpcklwd m11, m12
+ punpckhwd m12, m13, m14
+ punpcklwd m13, m14
+ punpckhdq m14, m11, m13
+ punpckldq m11, m13
+ punpckldq m13, m15, m9
+ punpckhdq m15, m9
+ punpckldq m9, m10, m0
+ punpckhdq m10, m0
+ punpckhdq m0, m8, m12
+ punpckldq m8, m12
+ punpcklqdq m12, m13, m8
+ punpckhqdq m13, m8
+ punpcklqdq m8, m9, m11
+ punpckhqdq m9, m11
+ punpckhqdq m11, m10, m14
+ punpcklqdq m10, m14
+ punpcklqdq m14, m15, m0
+ punpckhqdq m15, m0
+ mova m0, [rsp]
+ mova [rsp], m15
+ punpckhwd m15, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m15, m1
+ punpckhdq m15, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m15
+ punpcklqdq m6, m15
+ jmp tx2q
+.pass1_fast:
+ call .main_fast
+ mova xm1, [rsp+32*1]
+.pass1_fast_end:
+ vinserti128 m0, xm8, 1
+ vinserti128 m1, xm9, 1
+ vinserti128 m2, xm10, 1
+ vinserti128 m3, xm11, 1
+ vinserti128 m4, xm12, 1
+ vinserti128 m5, xm13, 1
+ vinserti128 m6, xm14, 1
+ vinserti128 m7, xm15, 1
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7
+ call .transpose_8x8
+ jmp tx2q
+.pass2:
+ test eobd, eobd
+ jl .pass2_fast
+ call .main
+ jmp .pass2_end
+.pass2_fast:
+ call .main_fast
+.pass2_end:
+ vpbroadcastd m1, [o(pw_512)]
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m5, m6, m7, m8, m9, m10, m11, m12, m14
+.end:
+ REPX {pmulhrsw x, m1}, m3, m13, m15
+ pmulhrsw m1, [rsp+32*1]
+ mova [rsp], m6
+ lea r3, [strideq*3]
+ WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 10, 11, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 14, 15, 0, 1, strideq*2, r3
+ RET
+ALIGN function_align
+ IDCT16_MAIN .main, 0
+ ret
+ALIGN function_align
+.transpose_8x8:
+ punpckhwd m8, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ ret
+
+%macro ADST_MULSUB_4W_FAST 12 ; dst/src[1-4], tmp[1-3], rnd, coef[1-4]
+ vpbroadcastd m%5, [o(pw_%11_m%10)]
+ vpbroadcastd m%6, [o(pw_m%11_m%10)]
+ punpckhwd m%7, m%3, m%2
+ punpcklwd m%3, m%2
+ pmaddwd m%2, m%3, m%5
+ pmaddwd m%5, m%7
+ pmaddwd m%4, m%3, m%6
+ pmaddwd m%6, m%7
+ REPX {paddd x, m%8}, m%2, m%5, m%4, m%6
+ REPX {psrad x, 14 }, m%2, m%5, m%4, m%6
+ packssdw m%2, m%5
+ vpbroadcastd m%5, [o(pw_%12_%9)]
+ packssdw m%4, m%6
+ vpbroadcastd m%6, [o(pw_m%12_%9)]
+ pmaddwd m%1, m%3, m%5
+ pmaddwd m%5, m%7
+ pmaddwd m%3, m%6
+ pmaddwd m%6, m%7
+ REPX {paddd x, m%8}, m%1, m%5, m%3, m%6
+ REPX {psrad x, 14 }, m%1, m%5, m%3, m%6
+ packssdw m%1, m%5
+ packssdw m%3, m%6
+%endmacro
+
+INV_TXFM_16X16_FN adst, dct, 39-18
+INV_TXFM_16X16_FN adst, adst
+
+cglobal vp9_iadst_16x16_internal, 0, 5, 16, 32*6, dst, stride, c, eob, tx2
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ mova m4, [cq+32*4]
+ mova m5, [cq+32*5]
+ mova m6, [cq+32*6]
+ mova m7, [cq+32*7]
+ sub eobd, 39
+ jl .pass1_fast
+ add cq, 32*12
+ mova m8, [cq-32*4]
+ mova m9, [cq-32*3]
+ mova m10, [cq-32*2]
+ mova m11, [cq-32*1]
+ mova m12, [cq+32*0]
+ mova m13, [cq+32*1]
+ mova m14, [cq+32*2]
+ mova m15, [cq+32*3]
+ mova [rsp+32*0], m15
+ call .main
+ call .pass1_main_part2
+ mova [rsp+32*1], m1
+ jmp m(vp9_idct_16x16_internal).pass1_end
+.pass1_fast:
+ call .main_fast
+ call .pass1_main_part2
+ mova xm0, [rsp+32*0]
+ jmp m(vp9_idct_16x16_internal).pass1_fast_end
+.pass2:
+ test eobd, eobd
+ jl .pass2_fast
+ call .main
+ jmp .pass2_end
+.pass2_fast:
+ call .main_fast
+.pass2_end:
+ ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
+ ; 16-bit here will produce the same result as using 32-bit intermediates.
+ paddsw m5, m10, m11 ; -out5
+ psubsw m10, m11 ; out10
+ psubsw m11, m8, m4 ; out11
+ paddsw m4, m8 ; out4
+ psubsw m8, m7, m9 ; out8
+ paddsw m7, m9 ; -out7
+ psubsw m9, m6, m1 ; out9
+ paddsw m6, m1 ; out6
+ vpbroadcastd m1, [o(pw_11585x2)]
+ REPX {pmulhrsw x, m1}, m4, m6, m8, m9, m10, m11
+ vpbroadcastd m1, [o(pw_m11585x2)]
+ pmulhrsw m5, m1
+ pmulhrsw m7, m1
+ vpbroadcastd m1, [o(pw_512)]
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m5, m6, m7, m8, m9, m10, m11, m12, m14
+ vpbroadcastd m1, [o(pw_m512)]
+ jmp m(vp9_idct_16x16_internal).end
+ALIGN function_align
+.main_fast:
+ mova [rsp+gprsize+32*1], m0
+ mova [rsp+gprsize+32*2], m3
+ mova [rsp+gprsize+32*3], m4
+ vpbroadcastd m15, [o(pd_8192)]
+ ADST_MULSUB_4W_FAST 13, 2, 5, 10, 0, 3, 4, 15, 3981, 15893, 14053, 8423
+ ADST_MULSUB_4W_FAST 9, 6, 1, 14, 0, 3, 4, 15, 9760, 13160, 16207, 2404
+ jmp .main2
+ALIGN function_align
+.main:
+ mova [rsp+gprsize+32*1], m0
+ mova [rsp+gprsize+32*2], m3
+ mova [rsp+gprsize+32*3], m4
+ mova [rsp+gprsize+32*4], m12
+ mova [rsp+gprsize+32*5], m8
+ vpbroadcastd m15, [o(pd_8192)]
+ ADST_MULSUB_4W 13, 2, 5, 10, 0, 3, 4, 8, 12, 15, 3981, 15893, 14053, 8423 ; t2a, t3a, t10a, t11a
+ ADST_MULSUB_4W 9, 6, 1, 14, 0, 3, 4, 8, 12, 15, 9760, 13160, 16207, 2404 ; t6a, t7a, t14a, t15a
+.main2:
+ ADST_MULSUB_4W 5, 10, 14, 1, 0, 3, 4, 8, 12, 15, 13623, 9102 ; t10a, t11a, t14a, t15a
+ psubw m4, m2, m6 ; t7
+ paddw m2, m6 ; t3
+ psubw m8, m13, m9 ; t6
+ paddw m13, m9 ; t2
+ mova m0, [rsp+gprsize+32*0] ; in15
+ mova [rsp+gprsize+32*0], m10
+ mova m10, [rsp+gprsize+32*1] ; in0
+ mova [rsp+gprsize+32*1], m13
+ mova m13, [rsp+gprsize+32*2] ; in3
+ mova [rsp+gprsize+32*2], m1
+ mova m6, [rsp+gprsize+32*3] ; in4
+ mova [rsp+gprsize+32*3], m2
+ mova m2, [rsp+gprsize+32*4] ; in12
+ mova [rsp+gprsize+32*4], m5
+ mova m5, [rsp+gprsize+32*5] ; in8
+ mova [rsp+gprsize+32*5], m14
+ test eobd, eobd
+ jl .main3_fast
+ ADST_MULSUB_4W 0, 10, 7, 5, 1, 3, 9, 12, 14, 15, 804, 16364, 12140, 11003 ; t0a, t1a, t8a, t9a
+ ADST_MULSUB_4W 11, 6, 13, 2, 1, 3, 9, 12, 14, 15, 7005, 14811, 15426, 5520 ; t4a, t5a, t12a, t13a
+ jmp .main3
+.main3_fast:
+ ADST_MULSUB_4W_FAST 0, 10, 7, 5, 1, 3, 9, 15, 804, 16364, 12140, 11003
+ ADST_MULSUB_4W_FAST 11, 6, 13, 2, 1, 3, 9, 15, 7005, 14811, 15426, 5520
+.main3:
+ ADST_MULSUB_4W 7, 5, 2, 13, 1, 3, 9, 12, 14, 15, 3196, 16069 ; t8a, t9a, t12a, t13a
+ psubw m3, m0, m11 ; t4
+ paddw m0, m11 ; t0
+ psubw m12, m10, m6 ; t5
+ paddw m10, m6 ; t1
+ mova m11, [rsp+gprsize+32*5] ; t14a
+ mova [rsp+gprsize+32*5], m10
+ mova m10, [rsp+gprsize+32*2] ; t15a
+ mova [rsp+gprsize+32*2], m0
+ ADST_MULSUB_4W 2, 13, 10, 11, 0, 1, 6, 9, 14, 15, 6270, 15137 ; out2, -out13, t14a, t15a
+ ADST_MULSUB_4W 3, 12, 4, 8, 0, 1, 6, 9, 14, 15, 6270, 15137 ; -out3, out12, t6, t7
+ mova m6, [rsp+gprsize+32*4] ; t10a
+ mova m14, [rsp+gprsize+32*0] ; t11a
+ mova m9, [rsp+gprsize+32*1] ; t2
+ mova m0, [rsp+gprsize+32*2] ; t0
+ mova m15, [rsp+gprsize+32*5] ; t1
+ psubw m1, m7, m6 ; t10
+ paddw m7, m6 ; -out1
+ psubw m6, m5, m14 ; t11
+ paddw m14, m5 ; out14
+ mova m5, [rsp+gprsize+32*3] ; t3
+ mova [rsp+gprsize+32*1], m7
+ psubw m7, m0, m9 ; t2a
+ paddw m0, m9 ; out0
+ psubw m9, m15, m5 ; t3a
+ paddw m15, m5 ; -out15
+ ret
+ALIGN function_align
+.pass1_main_part2:
+ mova [rsp+gprsize+16*0], xm0
+ vextracti128 [rsp+gprsize+16*4], m0, 1
+ mova [rsp+gprsize+32*3], m15
+ mova [rsp+gprsize+32*4], m13
+ mova [rsp+gprsize+32*5], m3
+ vpbroadcastd m15, [o(pw_m11585_11585)]
+ vpbroadcastd m13, [o(pw_11585_11585)]
+ vpbroadcastd m3, [o(pd_8192)]
+ punpcklwd m5, m11, m10
+ punpckhwd m11, m10
+ pmaddwd m10, m15, m5
+ pmaddwd m0, m15, m11
+ pmaddwd m5, m13
+ pmaddwd m11, m13
+ paddd m10, m3
+ paddd m0, m3
+ psubd m5, m3, m5
+ psubd m11, m3, m11
+ REPX {psrad x, 14}, m10, m0, m5, m11
+ packssdw m10, m0 ; out10
+ packssdw m5, m11 ; out5
+ punpcklwd m11, m8, m4
+ punpckhwd m8, m4
+ pmaddwd m4, m13, m11
+ pmaddwd m0, m13, m8
+ pmaddwd m11, m15
+ pmaddwd m8, m15
+ paddd m4, m3
+ paddd m0, m3
+ psubd m11, m3, m11
+ psubd m8, m3, m8
+ REPX {psrad x, 14}, m4, m0, m11, m8
+ packssdw m4, m0 ; out4
+ packssdw m11, m8 ; out11
+ punpcklwd m8, m9, m7
+ punpckhwd m9, m7
+ pmaddwd m7, m13, m8
+ pmaddwd m0, m13, m9
+ pmaddwd m8, m15
+ pmaddwd m9, m15
+ psubd m7, m3, m7
+ psubd m0, m3, m0
+ paddd m8, m3
+ paddd m9, m3
+ REPX {psrad x, 14}, m7, m0, m8, m9
+ packssdw m7, m0 ; out7
+ packssdw m8, m9 ; out8
+ punpckhwd m0, m6, m1
+ punpcklwd m6, m1
+ pmaddwd m1, m15, m0
+ pmaddwd m9, m15, m6
+ pmaddwd m0, m13
+ pmaddwd m6, m13
+ psubd m1, m3, m1
+ psubd m9, m3, m9
+ paddd m0, m3
+ paddd m6, m3
+ pxor m3, m3
+ psubw m15, m3, [rsp+gprsize+32*3] ; out15
+ REPX {psrad x, 14}, m1, m9, m0, m6
+ psubw m13, m3, [rsp+gprsize+32*4] ; out13
+ packssdw m9, m1 ; out7
+ psubw m1, m3, [rsp+gprsize+32*1] ; out1
+ packssdw m6, m0 ; out8
+ psubw m3, [rsp+gprsize+32*5] ; out3
+ ret
+
+%macro LOAD_8ROWS 2 ; src, stride
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+ mova m7, [%1+%2*7]
+%endmacro
+
+%macro LOAD_8ROWS_H 2 ; src, stride
+ mova m8, [%1+%2*0]
+ mova m9, [%1+%2*1]
+ mova m10, [%1+%2*2]
+ mova m11, [%1+%2*3]
+ mova m12, [%1+%2*4]
+ mova m13, [%1+%2*5]
+ mova m14, [%1+%2*6]
+ mova m15, [%1+%2*7]
+%endmacro
+
+; Perform the final sumsub step and YMM lane shuffling
+%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
+ mova m%3, [tmp2q+32*( 3-%1)]
+ psubw m%4, m%1, m%3
+ paddw m%1, m%3
+ mova m%3, [tmp1q+32*(11-%2)]
+ mova [tmp1q+32*(11-%2)+16], xm%4
+ vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
+ paddw m%4, m%2, m%3
+ psubw m%2, m%3
+ mova [tmp1q+32*(11-%2)], xm%2
+ vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
+ vperm2i128 m%2, m%1, m%4, 0x31
+ vinserti128 m%1, xm%4, 1
+%endmacro
+
+%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
+ mova m%4, [%2]
+ paddw m%3, m%1, m%4
+ psubw m%1, m%4
+ pmovzxbw m%4, [dstq+%6]
+ pmulhrsw m%3, m%5
+ pmulhrsw m%1, m%5
+ paddw m%3, m%4
+ pmovzxbw m%4, [r2+%7]
+ paddw m%1, m%4
+ packuswb m%3, m%1
+ vpermq m%3, m%3, q3120
+ mova [dstq+%6], xm%3
+ vextracti128 [r2+%7], m%3, 1
+%endmacro
+
+cglobal vp9_idct_idct_32x32_add, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ sub eobd, 1
+ jnz .pass1
+ movd xm0, [o(pw_11585x2)]
+ pmulhrsw xm5, xm0, [cq]
+ pxor m4, m4
+ pmulhrsw xm5, xm0
+ pmulhrsw xm5, [o(pw_512)]
+ movd [cq], xm4
+ or r3d, 16
+ vpbroadcastw m5, xm5
+.dconly_loop:
+ mova m2, [dstq+strideq*0]
+ mova m3, [dstq+strideq*1]
+ punpcklbw m0, m2, m4
+ punpckhbw m2, m4
+ punpcklbw m1, m3, m4
+ punpckhbw m3, m4
+ REPX {paddw x, m5}, m0, m2, m1, m3
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jg .dconly_loop
+ RET
+.pass1:
+ PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, \
+ tmp2, base, tmp3, tmp4
+ %undef cmp
+ lea tmp1q, [rsp+32*7]
+ sub eobd, 135
+ lea tmp2q, [tmp1q+32*8]
+ mov tmp4d, eobd
+.pass1_loop:
+ LOAD_8ROWS cq+64*1, 64*2
+ test eobd, eobd
+ jl .pass1_fast
+ LOAD_8ROWS_H cq+64*17, 64*2
+ call .main
+ LOAD_8ROWS_H cq+64*16, 64*2
+ mova [rsp], m15
+ LOAD_8ROWS cq+64*0, 64*2
+ call .idct16
+ mov tmp3d, 64*30
+ jmp .pass1_loop_end
+.pass1_fast:
+ call .main_fast
+ LOAD_8ROWS cq+64*0, 64*2
+ call .idct16_fast
+ mov tmp3d, 64*14
+.pass1_loop_end:
+ pxor m1, m1
+.zero_loop:
+ mova [cq+tmp3q+64*1], m1
+ mova [cq+tmp3q+64*0], m1
+ mova [cq+tmp3q-64*1], m1
+ mova [cq+tmp3q-64*2], m1
+ sub tmp3d, 64*4
+ jg .zero_loop
+ mova [rsp+32*0], m9
+ IDCT32_PASS1_END 0, 8, 1, 9
+ IDCT32_PASS1_END 2, 10, 1, 9
+ IDCT32_PASS1_END 3, 11, 1, 9
+ IDCT32_PASS1_END 4, 12, 1, 9
+ IDCT32_PASS1_END 5, 13, 1, 9
+ IDCT32_PASS1_END 6, 14, 1, 9
+ IDCT32_PASS1_END 7, 15, 1, 9
+ mova m1, [rsp+32*1]
+ mova m9, [rsp+32*0]
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ IDCT32_PASS1_END 1, 9, 6, 7
+ mova m7, [rsp+32*1]
+ punpckhwd m6, m12, m13
+ punpcklwd m12, m13
+ punpckhwd m13, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m9, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m10, m11
+ punpcklwd m10, m11
+ punpckhdq m11, m8, m10
+ punpckldq m8, m10
+ punpckldq m10, m12, m14
+ punpckhdq m12, m14
+ punpckhdq m14, m13, m15
+ punpckldq m13, m15
+ punpckldq m15, m6, m9
+ punpckhdq m6, m9
+ punpckhqdq m9, m8, m10
+ punpcklqdq m8, m10
+ punpcklqdq m10, m11, m12
+ punpckhqdq m11, m12
+ punpcklqdq m12, m13, m15
+ punpckhqdq m13, m15
+ punpckhqdq m15, m14, m6
+ punpcklqdq m14, m6
+ mova m6, [rsp+32*0]
+ mova [rsp+32*0], m8
+ call m(vp9_idct_16x16_internal).transpose_8x8
+ lea tmp3q, [tmp1q+32*32]
+ mova m8, [rsp]
+ mova [tmp3q-32*4], m0
+ mova [tmp3q-32*3], m2
+ mova [tmp3q-32*2], m4
+ mova [tmp3q-32*1], m6
+ mova [tmp3q+32*0], m8
+ mova [tmp3q+32*1], m10
+ mova [tmp3q+32*2], m12
+ mova [tmp3q+32*3], m14
+ add tmp3q, 32*8
+ mova [tmp3q-32*4], m1
+ mova [tmp3q-32*3], m3
+ mova [tmp3q-32*2], m5
+ mova [tmp3q-32*1], m7
+ mova [tmp3q+32*0], m9
+ mova [tmp3q+32*1], m11
+ mova [tmp3q+32*2], m13
+ mova [tmp3q+32*3], m15
+ mova m0, [tmp1q-32*4]
+ mova m1, [tmp1q-32*3]
+ mova m2, [tmp1q-32*2]
+ mova m3, [tmp1q-32*1]
+ mova m4, [tmp1q+32*0]
+ mova m5, [tmp1q+32*1]
+ mova m6, [tmp1q+32*2]
+ mova m7, [tmp1q+32*3]
+ call m(vp9_idct_16x16_internal).transpose_8x8
+ mova [tmp1q-32*4], m0
+ mova m0, [tmp2q-32*4]
+ mova [tmp2q-32*4], m1
+ mova m1, [tmp2q-32*3]
+ mova [tmp1q-32*3], m2
+ mova m2, [tmp2q-32*2]
+ mova [tmp2q-32*3], m3
+ mova m3, [tmp2q-32*1]
+ mova [tmp1q-32*2], m4
+ mova m4, [tmp2q+32*0]
+ mova [tmp2q-32*2], m5
+ mova m5, [tmp2q+32*1]
+ mova [tmp1q-32*1], m6
+ mova m6, [tmp2q+32*2]
+ mova [tmp2q-32*1], m7
+ mova m7, [tmp2q+32*3]
+ call m(vp9_idct_16x16_internal).transpose_8x8
+ mova [tmp1q+32*0], m0
+ mova [tmp2q+32*0], m1
+ mova [tmp1q+32*1], m2
+ mova [tmp2q+32*1], m3
+ mova [tmp1q+32*2], m4
+ mova [tmp2q+32*2], m5
+ mova [tmp1q+32*3], m6
+ mova [tmp2q+32*3], m7
+ add cq, 32
+ add tmp1q, 32*16
+ add tmp2q, 32*16
+ add tmp4d, 0x80000000
+ jnc .pass1_loop
+ add tmp1q, 32*24
+ imul r2, strideq, 19
+ lea tmp4q, [strideq*3]
+ add r2, dstq
+ test eobd, eobd
+ jge .pass2_loop
+ add tmp1q, 32*16
+ add tmp2q, 32*16
+ add tmp3q, 32*16
+.pass2_loop:
+ LOAD_8ROWS tmp2q-32*4, 32
+ test eobd, eobd
+ jl .pass2_fast
+ LOAD_8ROWS_H tmp3q-32*4, 32
+ call .main
+ sub tmp3q, 32*8
+ LOAD_8ROWS_H tmp3q-32*4, 32
+ sub tmp3q, 32*16
+ LOAD_8ROWS tmp3q-32*4, 32
+ mova [rsp], m15
+ call .idct16
+ jmp .pass2_loop_end
+.pass2_fast:
+ call .main_fast
+ sub tmp3q, 32*24
+ LOAD_8ROWS tmp3q-32*4, 32
+ call .idct16_fast
+.pass2_loop_end:
+ mova [rsp+32*0], m7
+ mova [rsp+32*2], m15
+ vpbroadcastd m15, [o(pw_512)]
+ IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, tmp4q*4
+ IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, tmp4q*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+32*1]
+ IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, tmp4q*4
+ IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, tmp4q*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, tmp4q*4
+ IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, tmp4q*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m7, [rsp+32*0]
+ mova m1, [rsp+32*2]
+ IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, tmp4q*4
+ IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, tmp4q*4, strideq*0
+ lea tmp3q, [tmp1q-32*32]
+ cmp tmp2q, tmp3q
+ jb .ret
+ sub tmp2q, 32*32
+ sub dstq, tmp4q
+ lea r2, [r2+tmp4q+16]
+ add dstq, 16
+ jmp .pass2_loop
+.ret:
+ RET
+ALIGN function_align
+ IDCT16_MAIN .idct16, 1
+ ret
+ALIGN function_align
+.main_fast:
+ mova [tmp1q+32*0], m7
+ vpbroadcastd m11, [o(pw_14811x2)]
+ vpbroadcastd m7, [o(pw_7005x2)]
+ vpbroadcastd m12, [o(pw_m5520x2)]
+ vpbroadcastd m8, [o(pw_15426x2)]
+ vpbroadcastd m13, [o(pw_15893x2)]
+ vpbroadcastd m15, [o(pw_3981x2)]
+ pmulhrsw m11, m4 ; t29a
+ vpbroadcastd m10, [o(pw_m8423x2)]
+ pmulhrsw m4, m7 ; t18a
+ vpbroadcastd m7, [o(pw_14053x2)]
+ pmulhrsw m12, m3 ; t19a
+ vpbroadcastd m9, [o(pw_13160x2)]
+ pmulhrsw m3, m8 ; t28a
+ vpbroadcastd m8, [o(pw_9760x2)]
+ pmulhrsw m13, m2 ; t27a
+ vpbroadcastd m14, [o(pw_m2404x2)]
+ pmulhrsw m2, m15 ; t20a
+ vpbroadcastd m15, [o(pw_16207x2)]
+ pmulhrsw m10, m5 ; t21a
+ pmulhrsw m5, m7 ; t26a
+ pmulhrsw m9, m6 ; t25a
+ pmulhrsw m6, m8 ; t22a
+ pmulhrsw m14, m1 ; t23a
+ pmulhrsw m1, m15 ; t24a
+ vpbroadcastd m15, [o(pd_8192)]
+ jmp .main2
+ALIGN function_align
+.main:
+ mova [tmp1q+32*0], m7
+ mova [tmp1q-32*1], m15
+ mova [tmp1q-32*2], m8
+ vpbroadcastd m15, [o(pd_8192)]
+ ITX_MULSUB_2W 4, 11, 7, 8, 15, 7005, 14811 ; t18a, t29a
+ ITX_MULSUB_2W 12, 3, 7, 8, 15, 15426, 5520 ; t19a, t28a
+ ITX_MULSUB_2W 2, 13, 7, 8, 15, 3981, 15893 ; t20a, t27a
+ ITX_MULSUB_2W 10, 5, 7, 8, 15, 14053, 8423 ; t21a, t26a
+ ITX_MULSUB_2W 6, 9, 7, 8, 15, 9760, 13160 ; t22a, t25a
+ ITX_MULSUB_2W 14, 1, 7, 8, 15, 16207, 2404 ; t23a, t24a
+.main2:
+ psubw m7, m12, m4 ; t18
+ paddw m12, m4 ; t19
+ psubw m4, m2, m10 ; t21
+ paddw m2, m10 ; t20
+ psubw m10, m14, m6 ; t22
+ paddw m14, m6 ; t23
+ psubw m6, m1, m9 ; t25
+ paddw m1, m9 ; t24
+ psubw m9, m13, m5 ; t26
+ paddw m13, m5 ; t27
+ psubw m5, m3, m11 ; t29
+ paddw m3, m11 ; t28
+ ITX_MULSUB_2W 5, 7, 8, 11, 15, m16069, 3196 ; t18a, t29a
+ ITX_MULSUB_2W 9, 4, 8, 11, 15, 13623, 9102 ; t21a, t26a
+ ITX_MULSUB_2W 6, 10, 8, 11, 15, m9102, 13623 ; t22a, t25a
+ psubw m8, m14, m2 ; t20a
+ paddw m14, m2 ; t23a
+ psubw m2, m1, m13 ; t27a
+ paddw m1, m13 ; t24a
+ psubw m13, m6, m9 ; t21
+ paddw m6, m9 ; t22
+ psubw m9, m10, m4 ; t26
+ paddw m10, m4 ; t25
+ ITX_MULSUB_2W 2, 8, 4, 11, 15, m15137, 6270 ; t20, t27
+ ITX_MULSUB_2W 9, 13, 4, 11, 15, m15137, 6270 ; t21a, t26a
+ mova [tmp1q+32*1], m6
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m1
+ mova m4, [tmp1q+32*0] ; in15
+ test eobd, eobd
+ jl .main3_fast
+ mova m6, [tmp1q-32*1] ; in31
+ mova m14, [tmp1q-32*2] ; in17
+ ITX_MULSUB_2W 0, 6, 1, 11, 15, 804, 16364 ; t16a, t31a
+ ITX_MULSUB_2W 14, 4, 1, 11, 15, 12140, 11003 ; t17a, t30a
+ jmp .main3
+.main3_fast:
+ vpbroadcastd m6, [o(pw_16364x2)]
+ vpbroadcastd m1, [o(pw_804x2)]
+ vpbroadcastd m14, [o(pw_m11003x2)]
+ vpbroadcastd m11, [o(pw_12140x2)]
+ pmulhrsw m6, m0 ; t31a
+ pmulhrsw m0, m1 ; t16a
+ pmulhrsw m14, m4 ; t17a
+ pmulhrsw m4, m11 ; t30a
+.main3:
+ psubw m1, m0, m14 ; t17
+ paddw m0, m14 ; t16
+ psubw m14, m6, m4 ; t30
+ paddw m4, m6 ; t31
+ ITX_MULSUB_2W 14, 1, 6, 11, 15, 3196, 16069 ; t17a, t30a
+ psubw m6, m0, m12 ; t19a
+ paddw m0, m12 ; t16a
+ psubw m12, m4, m3 ; t28a
+ paddw m4, m3 ; t31a
+ psubw m3, m14, m5 ; t18
+ paddw m14, m5 ; t17
+ psubw m5, m1, m7 ; t29
+ paddw m1, m7 ; t30
+ ITX_MULSUB_2W 5, 3, 7, 11, 15, 6270, 15137 ; t18a, t29a
+ ITX_MULSUB_2W 12, 6, 7, 11, 15, 6270, 15137 ; t19, t28
+ psubw m7, m1, m10 ; t25a
+ paddw m1, m10 ; t30a
+ psubw m10, m5, m9 ; t21
+ paddw m5, m9 ; t18
+ psubw m9, m12, m2 ; t20a
+ paddw m12, m2 ; t19a
+ psubw m2, m3, m13 ; t26
+ paddw m3, m13 ; t29
+ psubw m13, m6, m8 ; t27a
+ paddw m6, m8 ; t28a
+ mova [tmp1q-32*2], m5
+ mova [tmp1q-32*1], m12
+ mova [tmp2q+32*0], m6
+ mova [tmp2q+32*1], m3
+ mova [tmp2q+32*2], m1
+ mova m5, [tmp1q+32*1] ; t22
+ mova m6, [tmp1q+32*2] ; t23
+ mova m3, [tmp1q+32*3] ; t24a
+ psubw m1, m14, m5 ; t22a
+ paddw m14, m5 ; t17a
+ psubw m5, m0, m6 ; t23
+ paddw m0, m6 ; t16
+ psubw m6, m4, m3 ; t24
+ paddw m4, m3 ; t31
+ vpbroadcastd m8, [o(pw_m11585_11585)]
+ vpbroadcastd m3, [o(pw_11585_11585)]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m14
+ mova [tmp2q+32*3], m4
+ ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27
+ ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a
+ ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25
+ ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a
+ mova [tmp1q+32*0], m13
+ mova [tmp1q+32*1], m2
+ mova [tmp1q+32*2], m7
+ mova [tmp1q+32*3], m6
+ mova [tmp2q-32*4], m5
+ mova [tmp2q-32*3], m1
+ mova [tmp2q-32*2], m10
+ mova [tmp2q-32*1], m9
+ ret
+
+%endif
--
2.49.1
From b79973a4c18413948a3771532fd0da9d9dac7ace Mon Sep 17 00:00:00 2001
From: Henrik Gramner <gramner@twoorioles.com>
Date: Mon, 15 Sep 2025 14:11:58 +0200
Subject: [PATCH 2/2] vp9: Remove 8bpc AVX asm for inverse transforms
There's very little performance difference vs SSE2/SSSE3 and most
systems will use the AVX2 implementations anyway.
This reduces code size and compilation time by a significant amount.
---
libavcodec/x86/vp9dsp_init.c | 15 ---------------
libavcodec/x86/vp9itxfm.asm | 10 ----------
2 files changed, 25 deletions(-)
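For context: ff_vp9dsp_init_x86() installs the itxfm_add function pointers in ascending ISA order, so an entry set in the EXTERNAL_AVX() block is overwritten again whenever a later AVX2 block provides the same transform. On an AVX2-capable CPU the AVX code is therefore never executed, which is what makes it safe to drop. The sketch below is illustrative only (bpp checks omitted); the exact block layout and the cpu-flag macros used per block are assumptions that simply follow the existing *_<opt> naming pattern, not verbatim file contents.
```c
/* Illustrative sketch only -- not part of the patch.  Shows why the AVX
 * itxfm pointers are dead weight on AVX2-capable CPUs: later blocks in
 * the init function overwrite entries set by earlier ones. */
#include <stddef.h>
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp9dsp.h"

void ff_vp9_idct_idct_16x16_add_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      int16_t *block, int eob);
void ff_vp9_idct_idct_16x16_add_avx2 (uint8_t *dst, ptrdiff_t stride,
                                      int16_t *block, int eob);

static av_cold void init_itxfm_sketch(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSSE3(cpu_flags))
        dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;

    /* The EXTERNAL_AVX() assignments removed by this patch sat between the
     * two blocks and could only take effect on CPUs with AVX but no AVX2. */

    if (EXTERNAL_AVX2_FAST(cpu_flags)) /* runs last, wins whenever AVX2 is usable */
        dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2;
}
```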
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index a1e47445a8..72edf6bb45 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -111,14 +111,11 @@ itxfm_funcs(4, ssse3);
itxfm_funcs(4, avx2);
itxfm_funcs(8, sse2);
itxfm_funcs(8, ssse3);
-itxfm_funcs(8, avx);
itxfm_funcs(8, avx2);
itxfm_funcs(16, sse2);
itxfm_funcs(16, ssse3);
-itxfm_funcs(16, avx);
itxfm_func(idct, idct, 32, sse2);
itxfm_func(idct, idct, 32, ssse3);
-itxfm_func(idct, idct, 32, avx);
itxfm_func(iwht, iwht, 4, mmx);
itxfm_func(iwht, iwht, 4, avx2);
itxfm_funcs(16, avx2);
@@ -368,18 +365,6 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
}
if (EXTERNAL_AVX(cpu_flags)) {
- dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
- dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx;
- dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx;
- dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
- dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
- dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx;
- dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx;
- dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
- dsp->itxfm_add[TX_32X32][ADST_ADST] =
- dsp->itxfm_add[TX_32X32][ADST_DCT] =
- dsp->itxfm_add[TX_32X32][DCT_ADST] =
- dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
init_lpf(avx);
init_dir_tm_h_ipred(8, avx);
init_dir_tm_h_ipred(16, avx);
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index a78ea56c22..a560c21178 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -474,7 +474,6 @@ IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
pmulhrsw m7, W_11585x2_REG ; m7=t5
pmulhrsw m5, W_11585x2_REG ; m5=t6
SWAP 5, 1
- ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
psubw m6, m0, m3 ; m6=t0-t7
paddw m3, m0 ; m3=t0+t7
psubw m2, m0, m1 ; m2=t1-t6
@@ -722,7 +721,6 @@ cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob
VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
-VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
;---------------------------------------------------------------------------------------------
; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
@@ -896,11 +894,8 @@ IADST8_FN idct, IDCT8, iadst, IADST8, sse2, 15
IADST8_FN iadst, IADST8, idct, IDCT8, sse2, 15
IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
IADST8_FN idct, IDCT8, iadst, IADST8, ssse3, 16
-IADST8_FN idct, IDCT8, iadst, IADST8, avx, 16
IADST8_FN iadst, IADST8, idct, IDCT8, ssse3, 16
-IADST8_FN iadst, IADST8, idct, IDCT8, avx, 16
IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
-IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
;---------------------------------------------------------------------------------------------
; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
@@ -1438,7 +1433,6 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
VP9_IDCT_IDCT_16x16_ADD_XMM sse2
VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
-VP9_IDCT_IDCT_16x16_ADD_XMM avx
%macro VP9_IDCT16_YMM_1D 0
VP9_UNPACK_MULSUB_2W_4X 1, 15, 16305, 1606, [pd_8192], 0, 4 ; t8, t15
@@ -1905,9 +1899,6 @@ IADST16_FN iadst, IADST16, iadst, IADST16, sse2
IADST16_FN idct, IDCT16, iadst, IADST16, ssse3
IADST16_FN iadst, IADST16, idct, IDCT16, ssse3
IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
-IADST16_FN idct, IDCT16, iadst, IADST16, avx
-IADST16_FN iadst, IADST16, idct, IDCT16, avx
-IADST16_FN iadst, IADST16, iadst, IADST16, avx
; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
; out: m[0-15] except m6, which is in [blockq+192]
@@ -2847,4 +2838,3 @@ cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride,
VP9_IDCT_IDCT_32x32_ADD_XMM sse2
VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
-VP9_IDCT_IDCT_32x32_ADD_XMM avx
--
2.49.1