* [FFmpeg-devel] [PATCH] Stop using MMX in IDCTDSP (PR #20838)
@ 2025-11-05 4:11 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-11-05 4:11 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #20838 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20838
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20838.patch
>From 549f85f6c3f32f90429bed8362e8817268fad862 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 4 Nov 2025 13:56:01 +0100
Subject: [PATCH 1/7] avcodec/x86/idctdsp_init: Fix IDCT permutation for 32bit
without SSE2
bfb28b5ce89f3e950214b67ea95b45e3355c2caf removed the MMX idct_put
and idct_add functions, because they were overridden by SSE2 versions
(which use SSE2 only for the put/add part, not the actual IDCT).
This meant that for MMX, the idct functions are not set in unison,
so that the permutation which is meant to apply to all three
is incorrect on 32bit systems if SSE2 is unavailable/disabled.
Fix this by setting the MMX version only if SSE2 is enabled.
(No one complained, so apparently no one uses a new FFmpeg
on non-SSE2-capable systems.)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/idctdsp_init.c | 13 +------------
1 file changed, 1 insertion(+), 12 deletions(-)
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 2d165b975b..281d143ade 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -65,18 +65,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
{
int cpu_flags = av_get_cpu_flags();
-#if ARCH_X86_32
- if (EXTERNAL_MMX(cpu_flags)) {
- if (!high_bit_depth &&
- avctx->lowres == 0 &&
- (avctx->idct_algo == FF_IDCT_AUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
- c->idct = ff_simple_idct_mmx;
- }
- }
-#endif
-
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
c->put_pixels_clamped = ff_put_pixels_clamped_sse2;
@@ -88,6 +76,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+ c->idct = ff_simple_idct_mmx;
c->idct_put = ff_simple_idct_put_sse2;
c->idct_add = ff_simple_idct_add_sse2;
c->perm_type = FF_IDCT_PERM_SIMPLE;
--
2.49.1
>From d4013319afd63deb83cc1dbf2816382854085379 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 4 Nov 2025 17:53:30 +0100
Subject: [PATCH 2/7] avcodec/tests/x86/dct: Test 32bit simple idct
The test has been removed in bfb28b5ce89f3e950214b67ea95b45e3355c2caf
when MMX idctdsp functions overridden by SSE2 were removed;
ff_simple_idct_mmx() has been completely disabled in this patch
for x64 and so the test should have been disabled on x64 instead
of removing it.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/tests/x86/dct.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 7800abc7f7..e864de6904 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -88,6 +88,10 @@ static const struct algo idct_tab_arch[] = {
{ "SIMPLE10-AVX", ff_simple_idct10_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
{ "SIMPLE12-AVX", ff_simple_idct12_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX, 1 },
#endif
+#else
+#if HAVE_SSE2_EXTERNAL
+ { "SIMPLE-SSE2", ff_simple_idct_mmx, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2},
+#endif
#endif
#endif
{ 0 }
--
2.49.1
>From d30025d2857dc3cdcc9eb4c09ed85794473ac3a1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 5 Nov 2025 02:59:59 +0100
Subject: [PATCH 3/7] avcodec/x86/xvididct: Don't use MMX registers in SSE2
function
It is highly surprising and would necessitate emms in order to be ABI
compliant; but it is better just not to use them in the first place.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/xvididct.asm | 76 ++++++++++++++++++++-----------------
1 file changed, 42 insertions(+), 34 deletions(-)
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 4197551cdf..0daa2edd42 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -101,8 +101,6 @@ walkenIdctRounders: times 4 dd 65536
times 4 dd 512
times 2 dd 0
-pb_127: times 8 db 127
-
SECTION .text
; Temporary storage before the column pass
@@ -167,36 +165,47 @@ SECTION .text
%define TAN1 xmm2
%endif
-%macro JZ 2
- test %1, %1
+%macro JZ 3
+ test %1%3, %1%3
jz .%2
%endmacro
-%macro JNZ 2
- test %1, %1
+%macro JNZ 3
+ test %1%3, %1%3
jnz .%2
%endmacro
%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
%3 %4
- movq mm1, [%1]
- por mm1, [%1 + 8]
- paddusb mm1, mm0
- pmovmskb %2, mm1
+ mova m1, [%1]
+ ; due to signed saturation, m1 is all zero iff m1 is all zero after packing
+ packsswb m1, m1
+%if ARCH_X86_64
+ movq %2, m1
+%else
+ packsswb m1, m1
+ movd %2, m1
+%endif
%endmacro
;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
%macro TEST_TWO_ROWS 8
%5 %6
%7 %8
- movq mm1, [%1 + 0]
- por mm1, [%1 + 8]
- movq mm2, [%2 + 0]
- por mm2, [%2 + 8]
- paddusb mm1, mm0
- paddusb mm2, mm0
- pmovmskb %3, mm1
- pmovmskb %4, mm2
+ mova m1, [%1]
+ packsswb m1, [%2]
+ packsswb m1, m1
+%if ARCH_X86_64
+ movq %4, m1
+ mov %3d, %4d
+ shr %4q, 32
+%else
+ packsswb m1, m1
+ movd %3, m1
+ mov %4, %3
+ shr %4, 16
+ and %3, 0xFFFF
+%endif
%endmacro
; IDCT pass on rows.
@@ -499,16 +508,16 @@ SECTION .text
%macro IDCT_SSE2 1 ; 0=normal 1=put 2=add
%if %1 == 0 || ARCH_X86_32
- %define GPR0 r1d
- %define GPR1 r2d
- %define GPR2 r3d
- %define GPR3 r4d
+ %define GPR0 r1
+ %define GPR1 r2
+ %define GPR2 r3
+ %define GPR3 r4
%define NUM_GPRS 5
%else
- %define GPR0 r3d
- %define GPR1 r4d
- %define GPR2 r5d
- %define GPR3 r6d
+ %define GPR0 r3
+ %define GPR1 r4
+ %define GPR2 r5
+ %define GPR3 r6
%define NUM_GPRS 7
%endif
%if %1 == 0
@@ -527,34 +536,33 @@ cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
%xdefine BLOCK r0q
%endif
%endif
- movq mm0, [pb_127]
iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD, ROW1, 1*16
iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
- JZ GPR0, col1
+ JZ GPR0, col1, d
iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16
.col1:
TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
TEST_ONE_ROW BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
iLLM_HEAD
- JNZ GPR1, 2
- JNZ GPR0, 3
- JNZ GPR2, 4
- JNZ GPR3, 5
+ JNZ GPR1, 2, d
+ JNZ GPR0, 3, d
+ JNZ GPR2, 4, d
+ JNZ GPR3, 5, q
iLLM_PASS_SPARSE BLOCK, %1
jmp .6
.2:
iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
.3:
iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16
- JZ GPR2, col2
+ JZ GPR2, col2, d
.4:
iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
.col2:
- JZ GPR3, col3
+ JZ GPR3, col3, q
.5:
iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16
.col3:
--
2.49.1
>From a7e3cde808bd620e0bb9616261dd5c12cc71da97 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 5 Nov 2025 03:58:12 +0100
Subject: [PATCH 4/7] avcodec/x86/xvididct: Remove remnants of MMX
The non-MMX code only uses the first six rounders.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/xvididct.asm | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 0daa2edd42..c3bfabb955 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -24,7 +24,7 @@
;
; More details at http://skal.planet-d.net/coding/dct.html
;
-; ======= MMX and XMM forward discrete cosine transform =======
+; =========== XMM forward discrete cosine transform ===========
;
; Copyright(C) 2001 Peter Ross <pross@xvid.org>
;
@@ -67,7 +67,6 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
-; Similar to tg_1_16 in MMX code
tan1: times 8 dw 13036
tan2: times 8 dw 27146
tan3: times 8 dw 43790
@@ -91,7 +90,6 @@ iTab4: dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
-; Similar to rounder_0 in MMX code
; 4 first similar, then: 4*8->6*16 5*8->4*16 6/7*8->5*16
walkenIdctRounders: times 4 dd 65536
times 4 dd 3597
@@ -99,7 +97,6 @@ walkenIdctRounders: times 4 dd 65536
times 4 dd 1203
times 4 dd 120
times 4 dd 512
- times 2 dd 0
SECTION .text
--
2.49.1
>From 6b353d7a322b1142d80bdce172cabe8257f239c1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 5 Nov 2025 04:04:02 +0100
Subject: [PATCH 5/7] avcodec/tests/dct: Remove unnecessary emms_c
Unnecessary since the Xvid IDCT no longer uses MMX registers at all.
(Notice that the simple MMX IDCT issues emms and is therefore ABI
compliant.)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/tests/dct.c | 4 ----
1 file changed, 4 deletions(-)
diff --git a/libavcodec/tests/dct.c b/libavcodec/tests/dct.c
index 784b49276c..eb74f3559e 100644
--- a/libavcodec/tests/dct.c
+++ b/libavcodec/tests/dct.c
@@ -37,7 +37,6 @@
#include "libavutil/cpu.h"
#include "libavutil/common.h"
-#include "libavutil/emms.h"
#include "libavutil/internal.h"
#include "libavutil/lfg.h"
#include "libavutil/mem_internal.h"
@@ -212,7 +211,6 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed, c
permute(block, block1, dct->perm_type);
dct->func(block);
- emms_c();
if (!strcmp(dct->name, "IJG-AAN-INT")) {
for (i = 0; i < 64; i++) {
@@ -287,7 +285,6 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed, c
memcpy(block, block1, sizeof(block));
dct->func(block);
}
- emms_c();
it1 += NB_ITS_SPEED;
ti1 = av_gettime_relative() - ti;
} while (ti1 < 1000000);
@@ -449,7 +446,6 @@ static void idct248_error(const char *name,
block[i] = block1[i];
idct248_put(img_dest, 8, block);
}
- emms_c();
it1 += NB_ITS_SPEED;
ti1 = av_gettime_relative() - ti;
} while (ti1 < 1000000);
--
2.49.1
>From 7b6c5ddebd8c43127ac0d2f1c744cd9d17d9a7ca Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 4 Nov 2025 14:25:54 +0100
Subject: [PATCH 6/7] avcodec/x86/simple_idct: Port to SSE2
Before this commit, the (32-bit only) simple idct came in three
versions: A pure MMX IDCT and idct-put and idct-add versions
which use SSE2 at the put and add stage, but still use pure MMX
for the actual IDCT.
This commit ports said IDCT to SSE2; this was entirely trivial
for the IDCT1-5 and IDCT7 parts (where one can directly use
the full register width) and was easy for IDCT6 and IDCT8
(involving a few movhps and pshufds). Unfortunately, DC_COND_INIT
and Z_COND_INIT still use only the lower half of the registers.
This saved 4658B here; the benchmarking option of the dct test tool
showed a 15% speedup.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/tests/x86/dct.c | 2 +-
libavcodec/x86/idctdsp_init.c | 2 +-
libavcodec/x86/simple_idct.asm | 1242 ++++++++++++++++----------------
libavcodec/x86/simple_idct.h | 5 +-
4 files changed, 628 insertions(+), 623 deletions(-)
diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index e864de6904..f879ab1d42 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -90,7 +90,7 @@ static const struct algo idct_tab_arch[] = {
#endif
#else
#if HAVE_SSE2_EXTERNAL
- { "SIMPLE-SSE2", ff_simple_idct_mmx, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2},
+ { "SIMPLE-SSE2", ff_simple_idct_sse2, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2},
#endif
#endif
#endif
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 281d143ade..9c7f235b3f 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -76,7 +76,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
- c->idct = ff_simple_idct_mmx;
+ c->idct = ff_simple_idct_sse2;
c->idct_put = ff_simple_idct_put_sse2;
c->idct_add = ff_simple_idct_add_sse2;
c->perm_type = FF_IDCT_PERM_SIMPLE;
diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm
index c79519372a..2410737038 100644
--- a/libavcodec/x86/simple_idct.asm
+++ b/libavcodec/x86/simple_idct.asm
@@ -30,8 +30,8 @@ SECTION_RODATA
%if ARCH_X86_32
cextern pb_80
+d40000: dd 4 << 16, 0 ; must be 16-byte aligned
wm1010: dw 0, 0xffff, 0, 0xffff
-d40000: dd 4 << 16, 0
; 23170.475006
; 22725.260826
@@ -57,650 +57,675 @@ d40000: dd 4 << 16, 0
coeffs:
dw 1 << (ROW_SHIFT - 1), 0
dw 1 << (ROW_SHIFT - 1), 0
+ dw 1 << (ROW_SHIFT - 1), 0
+ dw 1 << (ROW_SHIFT - 1), 0
+ dw 1 << (ROW_SHIFT - 1), 1
+ dw 1 << (ROW_SHIFT - 1), 0
dw 1 << (ROW_SHIFT - 1), 1
dw 1 << (ROW_SHIFT - 1), 0
- dw C4, C4, C4, C4
- dw C4, -C4, C4, -C4
+ dw C4, C4, C4, C4, C4, C4, C4, C4
+ dw C4, -C4, C4, -C4, C4, -C4, C4, -C4
- dw C2, C6, C2, C6
- dw C6, -C2, C6, -C2
+ dw C2, C6, C2, C6, C2, C6, C2, C6
+ dw C6, -C2, C6, -C2, C6, -C2, C6, -C2
- dw C1, C3, C1, C3
- dw C5, C7, C5, C7
+ dw C1, C3, C1, C3, C1, C3, C1, C3
+ dw C5, C7, C5, C7, C5, C7, C5, C7
- dw C3, -C7, C3, -C7
- dw -C1, -C5, -C1, -C5
+ dw C3, -C7, C3, -C7, C3, -C7, C3, -C7
+ dw -C1, -C5, -C1, -C5, -C1, -C5, -C1, -C5
- dw C5, -C1, C5, -C1
- dw C7, C3, C7, C3
+ dw C5, -C1, C5, -C1, C5, -C1, C5, -C1
+ dw C7, C3, C7, C3, C7, C3, C7, C3
- dw C7, -C5, C7, -C5
- dw C3, -C1, C3, -C1
+ dw C7, -C5, C7, -C5, C7, -C5, C7, -C5
+ dw C3, -C1, C3, -C1, C3, -C1, C3, -C1
SECTION .text
%macro DC_COND_IDCT 7
- movq mm0, [blockq + %1] ; R4 R0 r4 r0
- movq mm1, [blockq + %2] ; R6 R2 r6 r2
- movq mm2, [blockq + %3] ; R3 R1 r3 r1
- movq mm3, [blockq + %4] ; R7 R5 r7 r5
- movq mm4, [wm1010]
- pand mm4, mm0
- por mm4, mm1
- por mm4, mm2
- por mm4, mm3
- packssdw mm4, mm4
- movd t0d, mm4
+ movq m0, [blockq + %1] ; R4 R0 r4 r0
+ movq m1, [blockq + %2] ; R6 R2 r6 r2
+ movq m2, [blockq + %3] ; R3 R1 r3 r1
+ movq m3, [blockq + %4] ; R7 R5 r7 r5
+ movq m4, [wm1010]
+ pand m4, m0
+ por m4, m1
+ por m4, m2
+ por m4, m3
+ packssdw m4, m4
+ movd t0d, m4
or t0d, t0d
jz %%1
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- paddd mm4, [coeffs + 8]
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5
- paddd mm0, [coeffs + 8]
- paddd mm1, mm0 ; A1 a1
- paddd mm0, mm0
- psubd mm0, mm1 ; A2 a2
- pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm5 ; B0 b0
- movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- paddd mm5, mm2 ; B1 b1
- psrad mm7, %7
- psrad mm4, %7
- movq mm2, mm1 ; A1 a1
- paddd mm1, mm5 ; A1+B1 a1+b1
- psubd mm2, mm5 ; A1-B1 a1-b1
- psrad mm1, %7
- psrad mm2, %7
- packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0
- packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
- movq [%5], mm7
- movq mm1, [blockq + %3] ; R3 R1 r3 r1
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- movq [24 + %5], mm2
- pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1
- movq mm7, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm0 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4, mm7 ; B2 b2
- paddd mm2, mm4 ; A2+B2 a2+b2
- psubd mm0, mm4 ; a2-B2 a2-b2
- psrad mm2, %7
- psrad mm0, %7
- movq mm4, mm6 ; A3 a3
- paddd mm3, mm1 ; B3 b3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm4, mm3 ; a3-B3 a3-b3
- psrad mm6, %7
- packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2
- movq [8 + %5], mm2
- psrad mm4, %7
- packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3
- movq [16 + %5], mm4
+ movq m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ movq m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ movq m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ movq m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ movq m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ paddd m4, [coeffs + 16]
+ movq m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ movq m5, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m5, m3 ; C7R7+C5R5 C7r7+C5r5
+ paddd m0, [coeffs + 16]
+ paddd m1, m0 ; A1 a1
+ paddd m0, m0
+ psubd m0, m1 ; A2 a2
+ pmaddwd m2, [coeffs + 128] ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m5 ; B0 b0
+ movq m5, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m5, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ paddd m5, m2 ; B1 b1
+ psrad m7, %7
+ psrad m4, %7
+ movq m2, m1 ; A1 a1
+ paddd m1, m5 ; A1+B1 a1+b1
+ psubd m2, m5 ; A1-B1 a1-b1
+ psrad m1, %7
+ psrad m2, %7
+ packssdw m7, m1 ; A1+B1 a1+b1 A0+B0 a0+b0
+ pshufd m7, m7, 0xD8
+ packssdw m2, m4 ; A0-B0 a0-b0 A1-B1 a1-b1
+ pshufd m2, m2, 0xD8
+ movq [%5], m7
+ movq m1, [blockq + %3] ; R3 R1 r3 r1
+ movq m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ movq [24 + %5], m2
+ pmaddwd m4, m1 ; -C1R3+C5R1 -C1r3+C5r1
+ movq m7, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m1, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ pmaddwd m7, m3 ; C3R7+C7R5 C3r7+C7r5
+ movq m2, m0 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m4, m7 ; B2 b2
+ paddd m2, m4 ; A2+B2 a2+b2
+ psubd m0, m4 ; a2-B2 a2-b2
+ psrad m2, %7
+ psrad m0, %7
+ movq m4, m6 ; A3 a3
+ paddd m3, m1 ; B3 b3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m4, m3 ; a3-B3 a3-b3
+ psrad m6, %7
+ packssdw m2, m6 ; A3+B3 a3+b3 A2+B2 a2+b2
+ pshufd m2, m2, 0xD8
+ movq [8 + %5], m2
+ psrad m4, %7
+ packssdw m4, m0 ; A2-B2 a2-b2 A3-B3 a3-b3
+ pshufd m4, m4, 0xD8
+ movq [16 + %5], m4
jmp %%2
%%1:
- pslld mm0, 16
- paddd mm0, [d40000]
- psrad mm0, 13
- packssdw mm0, mm0
- movq [%5], mm0
- movq [8 + %5], mm0
- movq [16 + %5], mm0
- movq [24 + %5], mm0
+ pslld m0, 16
+ ; d40000 is only eight bytes long, so this will clobber
+ ; the upper half of m0 with wm1010. It doesn't matter due to pshufd below.
+ paddd m0, [d40000]
+ psrad m0, 13
+ packssdw m0, m0
+ pshufd m0, m0, 0x0
+ mova [%5], m0
+ mova [16 + %5], m0
%%2:
%endmacro
%macro Z_COND_IDCT 8
- movq mm0, [blockq + %1] ; R4 R0 r4 r0
- movq mm1, [blockq + %2] ; R6 R2 r6 r2
- movq mm2, [blockq + %3] ; R3 R1 r3 r1
- movq mm3, [blockq + %4] ; R7 R5 r7 r5
- movq mm4, mm0
- por mm4, mm1
- por mm4, mm2
- por mm4, mm3
- packssdw mm4, mm4
- movd t0d, mm4
+ movq m0, [blockq + %1] ; R4 R0 r4 r0
+ movq m1, [blockq + %2] ; R6 R2 r6 r2
+ movq m2, [blockq + %3] ; R3 R1 r3 r1
+ movq m3, [blockq + %4] ; R7 R5 r7 r5
+ movq m4, m0
+ por m4, m1
+ por m4, m2
+ por m4, m3
+ packssdw m4, m4
+ movd t0d, m4
or t0d, t0d
jz %8
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- paddd mm4, [coeffs]
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5
- paddd mm0, [coeffs]
- paddd mm1, mm0 ; A1 a1
- paddd mm0, mm0
- psubd mm0, mm1 ; A2 a2
- pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm5 ; B0 b0
- movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- paddd mm5, mm2 ; B1 b1
- psrad mm7, %7
- psrad mm4, %7
- movq mm2, mm1 ; A1 a1
- paddd mm1, mm5 ; A1+B1 a1+b1
- psubd mm2, mm5 ; A1-B1 a1-b1
- psrad mm1, %7
- psrad mm2, %7
- packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0
- packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
- movq [%5], mm7
- movq mm1, [blockq + %3] ; R3 R1 r3 r1
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- movq [24 + %5], mm2
- pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1
- movq mm7, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm0 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4, mm7 ; B2 b2
- paddd mm2, mm4 ; A2+B2 a2+b2
- psubd mm0, mm4 ; a2-B2 a2-b2
- psrad mm2, %7
- psrad mm0, %7
- movq mm4, mm6 ; A3 a3
- paddd mm3, mm1 ; B3 b3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm4, mm3 ; a3-B3 a3-b3
- psrad mm6, %7
- packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2
- movq [8 + %5], mm2
- psrad mm4, %7
- packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3
- movq [16 + %5], mm4
+ movq m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ movq m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ movq m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ movq m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ movq m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ paddd m4, [coeffs]
+ movq m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ movq m5, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m5, m3 ; C7R7+C5R5 C7r7+C5r5
+ paddd m0, [coeffs]
+ paddd m1, m0 ; A1 a1
+ paddd m0, m0
+ psubd m0, m1 ; A2 a2
+ pmaddwd m2, [coeffs + 128] ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m5 ; B0 b0
+ movq m5, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m5, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ paddd m5, m2 ; B1 b1
+ psrad m7, %7
+ psrad m4, %7
+ movq m2, m1 ; A1 a1
+ paddd m1, m5 ; A1+B1 a1+b1
+ psubd m2, m5 ; A1-B1 a1-b1
+ psrad m1, %7
+ psrad m2, %7
+ packssdw m7, m1 ; A1+B1 a1+b1 A0+B0 a0+b0
+ pshufd m7, m7, 0xD8
+ packssdw m2, m4 ; A0-B0 a0-b0 A1-B1 a1-b1
+ pshufd m2, m2, 0xD8
+ movq [%5], m7
+ movq m1, [blockq + %3] ; R3 R1 r3 r1
+ movq m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ movq [24 + %5], m2
+ pmaddwd m4, m1 ; -C1R3+C5R1 -C1r3+C5r1
+ movq m7, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m1, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ pmaddwd m7, m3 ; C3R7+C7R5 C3r7+C7r5
+ movq m2, m0 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m4, m7 ; B2 b2
+ paddd m2, m4 ; A2+B2 a2+b2
+ psubd m0, m4 ; a2-B2 a2-b2
+ psrad m2, %7
+ psrad m0, %7
+ movq m4, m6 ; A3 a3
+ paddd m3, m1 ; B3 b3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m4, m3 ; a3-B3 a3-b3
+ psrad m6, %7
+ packssdw m2, m6 ; A3+B3 a3+b3 A2+B2 a2+b2
+ pshufd m2, m2, 0xD8
+ movq [8 + %5], m2
+ psrad m4, %7
+ packssdw m4, m0 ; A2-B2 a2-b2 A3-B3 a3-b3
+ pshufd m4, m4, 0xD8
+ movq [16 + %5], m4
%endmacro
%macro IDCT1 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm1, %2 ; R6 R2 r6 r2
- movq mm2, %3 ; R3 R1 r3 r1
- movq mm3, %4 ; R7 R5 r7 r5
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- paddd mm0, mm1 ; A1 a1
- psubd mm5, mm1 ; A2 a2
- movq mm1, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
- pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm1 ; B0 b0
- movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- paddd mm1, mm2 ; B1 b1
- psrad mm7, %6
- psrad mm4, %6
- movq mm2, mm0 ; A1 a1
- paddd mm0, mm1 ; A1+B1 a1+b1
- psubd mm2, mm1 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm2, %6
- packssdw mm7, mm7 ; A0+B0 a0+b0
- movd [%5], mm7
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm2, mm2 ; A1-B1 a1-b1
- movd [96 + %5], mm2
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm0, %3 ; R3 R1 r3 r1
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1
- movq mm7, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm5 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4, mm7 ; B2 b2
- paddd mm2, mm4 ; A2+B2 a2+b2
- psubd mm5, mm4 ; a2-B2 a2-b2
- psrad mm2, %6
- psrad mm5, %6
- movq mm4, mm6 ; A3 a3
- paddd mm3, mm0 ; B3 b3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm4, mm3 ; a3-B3 a3-b3
- psrad mm6, %6
- psrad mm4, %6
- packssdw mm2, mm2 ; A2+B2 a2+b2
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [32 + %5], mm2
- packssdw mm4, mm4 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [48 + %5], mm6
- movd [64 + %5], mm4
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m1, %2 ; R6 R2 r6 r2
+ mova m2, %3 ; R3 R1 r3 r1
+ mova m3, %4 ; R7 R5 r7 r5
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ mova m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ paddd m0, m1 ; A1 a1
+ psubd m5, m1 ; A2 a2
+ mova m1, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m1, m3 ; C7R7+C5R5 C7r7+C5r5
+ pmaddwd m2, [coeffs + 128] ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m1 ; B0 b0
+ mova m1, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m1, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ paddd m1, m2 ; B1 b1
+ psrad m7, %6
+ psrad m4, %6
+ mova m2, m0 ; A1 a1
+ paddd m0, m1 ; A1+B1 a1+b1
+ psubd m2, m1 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m2, %6
+ packssdw m7, m7 ; A0+B0 a0+b0
+ movq [%5], m7
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m2, m2 ; A1-B1 a1-b1
+ movq [96 + %5], m2
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m0, %3 ; R3 R1 r3 r1
+ mova m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ pmaddwd m4, m0 ; -C1R3+C5R1 -C1r3+C5r1
+ mova m7, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m0, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ pmaddwd m7, m3 ; C3R7+C7R5 C3r7+C7r5
+ mova m2, m5 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m4, m7 ; B2 b2
+ paddd m2, m4 ; A2+B2 a2+b2
+ psubd m5, m4 ; a2-B2 a2-b2
+ psrad m2, %6
+ psrad m5, %6
+ mova m4, m6 ; A3 a3
+ paddd m3, m0 ; B3 b3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m4, m3 ; a3-B3 a3-b3
+ psrad m6, %6
+ psrad m4, %6
+ packssdw m2, m2 ; A2+B2 a2+b2
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [32 + %5], m2
+ packssdw m4, m4 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [48 + %5], m6
+ movq [64 + %5], m4
+ movq [80 + %5], m5
%endmacro
%macro IDCT2 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm1, %2 ; R6 R2 r6 r2
- movq mm3, %4 ; R7 R5 r7 r5
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- paddd mm0, mm1 ; A1 a1
- psubd mm5, mm1 ; A2 a2
- movq mm1, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
- movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm1, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm1 ; A0-B0 a0-b0
- psrad mm1, %6
- psrad mm4, %6
- movq mm2, mm0 ; A1 a1
- paddd mm0, mm7 ; A1+B1 a1+b1
- psubd mm2, mm7 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm2, %6
- packssdw mm1, mm1 ; A0+B0 a0+b0
- movd [%5], mm1
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm2, mm2 ; A1-B1 a1-b1
- movd [96 + %5], mm2
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm1, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm5 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm2, mm1 ; A2+B2 a2+b2
- psubd mm5, mm1 ; a2-B2 a2-b2
- psrad mm2, %6
- psrad mm5, %6
- movq mm1, mm6 ; A3 a3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm1, mm3 ; a3-B3 a3-b3
- psrad mm6, %6
- psrad mm1, %6
- packssdw mm2, mm2 ; A2+B2 a2+b2
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [32 + %5], mm2
- packssdw mm1, mm1 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [48 + %5], mm6
- movd [64 + %5], mm1
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m1, %2 ; R6 R2 r6 r2
+ mova m3, %4 ; R7 R5 r7 r5
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ mova m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ paddd m0, m1 ; A1 a1
+ psubd m5, m1 ; A2 a2
+ mova m1, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m1, m3 ; C7R7+C5R5 C7r7+C5r5
+ mova m7, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m7, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m1, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m1 ; A0-B0 a0-b0
+ psrad m1, %6
+ psrad m4, %6
+ mova m2, m0 ; A1 a1
+ paddd m0, m7 ; A1+B1 a1+b1
+ psubd m2, m7 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m2, %6
+ packssdw m1, m1 ; A0+B0 a0+b0
+ movq [%5], m1
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m2, m2 ; A1-B1 a1-b1
+ movq [96 + %5], m2
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m1, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m1, m3 ; C3R7+C7R5 C3r7+C7r5
+ mova m2, m5 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m2, m1 ; A2+B2 a2+b2
+ psubd m5, m1 ; a2-B2 a2-b2
+ psrad m2, %6
+ psrad m5, %6
+ mova m1, m6 ; A3 a3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m1, m3 ; a3-B3 a3-b3
+ psrad m6, %6
+ psrad m1, %6
+ packssdw m2, m2 ; A2+B2 a2+b2
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [32 + %5], m2
+ packssdw m1, m1 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [48 + %5], m6
+ movq [64 + %5], m1
+ movq [80 + %5], m5
%endmacro
%macro IDCT3 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm3, %4 ; R7 R5 r7 r5
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm1, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
- movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm1, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm1 ; A0-B0 a0-b0
- psrad mm1, %6
- psrad mm4, %6
- movq mm2, mm0 ; A1 a1
- paddd mm0, mm7 ; A1+B1 a1+b1
- psubd mm2, mm7 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm2, %6
- packssdw mm1, mm1 ; A0+B0 a0+b0
- movd [%5], mm1
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm2, mm2 ; A1-B1 a1-b1
- movd [96 + %5], mm2
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm1, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm5 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm2, mm1 ; A2+B2 a2+b2
- psubd mm5, mm1 ; a2-B2 a2-b2
- psrad mm2, %6
- psrad mm5, %6
- movq mm1, mm6 ; A3 a3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm1, mm3 ; a3-B3 a3-b3
- psrad mm6, %6
- psrad mm1, %6
- packssdw mm2, mm2 ; A2+B2 a2+b2
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [32 + %5], mm2
- packssdw mm1, mm1 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [48 + %5], mm6
- movd [64 + %5], mm1
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m3, %4 ; R7 R5 r7 r5
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m1, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m1, m3 ; C7R7+C5R5 C7r7+C5r5
+ mova m7, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m7, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m1, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m1 ; A0-B0 a0-b0
+ psrad m1, %6
+ psrad m4, %6
+ mova m2, m0 ; A1 a1
+ paddd m0, m7 ; A1+B1 a1+b1
+ psubd m2, m7 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m2, %6
+ packssdw m1, m1 ; A0+B0 a0+b0
+ movq [%5], m1
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m2, m2 ; A1-B1 a1-b1
+ movq [96 + %5], m2
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m1, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m1, m3 ; C3R7+C7R5 C3r7+C7r5
+ mova m2, m5 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m2, m1 ; A2+B2 a2+b2
+ psubd m5, m1 ; a2-B2 a2-b2
+ psrad m2, %6
+ psrad m5, %6
+ mova m1, m6 ; A3 a3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m1, m3 ; a3-B3 a3-b3
+ psrad m6, %6
+ psrad m1, %6
+ packssdw m2, m2 ; A2+B2 a2+b2
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [32 + %5], m2
+ packssdw m1, m1 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [48 + %5], m6
+ movq [64 + %5], m1
+ movq [80 + %5], m5
%endmacro
%macro IDCT4 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm2, %3 ; R3 R1 r3 r1
- movq mm3, %4 ; R7 R5 r7 r5
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm1, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
- pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm1 ; B0 b0
- movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- paddd mm1, mm2 ; B1 b1
- psrad mm7, %6
- psrad mm4, %6
- movq mm2, mm0 ; A1 a1
- paddd mm0, mm1 ; A1+B1 a1+b1
- psubd mm2, mm1 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm2, %6
- packssdw mm7, mm7 ; A0+B0 a0+b0
- movd [%5], mm7
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm2, mm2 ; A1-B1 a1-b1
- movd [96 + %5], mm2
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm0, %3 ; R3 R1 r3 r1
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1
- movq mm7, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm5 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4, mm7 ; B2 b2
- paddd mm2, mm4 ; A2+B2 a2+b2
- psubd mm5, mm4 ; a2-B2 a2-b2
- psrad mm2, %6
- psrad mm5, %6
- movq mm4, mm6 ; A3 a3
- paddd mm3, mm0 ; B3 b3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm4, mm3 ; a3-B3 a3-b3
- psrad mm6, %6
- psrad mm4, %6
- packssdw mm2, mm2 ; A2+B2 a2+b2
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [32 + %5], mm2
- packssdw mm4, mm4 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [48 + %5], mm6
- movd [64 + %5], mm4
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m2, %3 ; R3 R1 r3 r1
+ mova m3, %4 ; R7 R5 r7 r5
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m1, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m1, m3 ; C7R7+C5R5 C7r7+C5r5
+ pmaddwd m2, [coeffs + 128] ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m1 ; B0 b0
+ mova m1, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m1, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ paddd m1, m2 ; B1 b1
+ psrad m7, %6
+ psrad m4, %6
+ mova m2, m0 ; A1 a1
+ paddd m0, m1 ; A1+B1 a1+b1
+ psubd m2, m1 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m2, %6
+ packssdw m7, m7 ; A0+B0 a0+b0
+ movq [%5], m7
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m2, m2 ; A1-B1 a1-b1
+ movq [96 + %5], m2
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m0, %3 ; R3 R1 r3 r1
+ mova m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ pmaddwd m4, m0 ; -C1R3+C5R1 -C1r3+C5r1
+ mova m7, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m0, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ pmaddwd m7, m3 ; C3R7+C7R5 C3r7+C7r5
+ mova m2, m5 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m4, m7 ; B2 b2
+ paddd m2, m4 ; A2+B2 a2+b2
+ psubd m5, m4 ; a2-B2 a2-b2
+ psrad m2, %6
+ psrad m5, %6
+ mova m4, m6 ; A3 a3
+ paddd m3, m0 ; B3 b3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m4, m3 ; a3-B3 a3-b3
+ psrad m6, %6
+ psrad m4, %6
+ packssdw m2, m2 ; A2+B2 a2+b2
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [32 + %5], m2
+ packssdw m4, m4 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [48 + %5], m6
+ movq [64 + %5], m4
+ movq [80 + %5], m5
%endmacro
%macro IDCT5 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm2, %3 ; R3 R1 r3 r1
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm3, [coeffs + 64]
- pmaddwd mm3, mm2 ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- psrad mm7, %6
- psrad mm4, %6
- movq mm1, mm0 ; A1 a1
- paddd mm0, mm3 ; A1+B1 a1+b1
- psubd mm1, mm3 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm1, %6
- packssdw mm7, mm7 ; A0+B0 a0+b0
- movd [%5], mm7
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm1, mm1 ; A1-B1 a1-b1
- movd [96 + %5], mm1
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1
- pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- movq mm1, mm5 ; A2 a2
- paddd mm1, mm4 ; A2+B2 a2+b2
- psubd mm5, mm4 ; a2-B2 a2-b2
- psrad mm1, %6
- psrad mm5, %6
- movq mm4, mm6 ; A3 a3
- paddd mm6, mm2 ; A3+B3 a3+b3
- psubd mm4, mm2 ; a3-B3 a3-b3
- psrad mm6, %6
- psrad mm4, %6
- packssdw mm1, mm1 ; A2+B2 a2+b2
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [32 + %5], mm1
- packssdw mm4, mm4 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [48 + %5], mm6
- movd [64 + %5], mm4
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m2, %3 ; R3 R1 r3 r1
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m3, [coeffs + 128]
+ pmaddwd m3, m2 ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ psrad m7, %6
+ psrad m4, %6
+ mova m1, m0 ; A1 a1
+ paddd m0, m3 ; A1+B1 a1+b1
+ psubd m1, m3 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m1, %6
+ packssdw m7, m7 ; A0+B0 a0+b0
+ movq [%5], m7
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m1, m1 ; A1-B1 a1-b1
+ movq [96 + %5], m1
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ pmaddwd m4, m2 ; -C1R3+C5R1 -C1r3+C5r1
+ pmaddwd m2, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ mova m1, m5 ; A2 a2
+ paddd m1, m4 ; A2+B2 a2+b2
+ psubd m5, m4 ; a2-B2 a2-b2
+ psrad m1, %6
+ psrad m5, %6
+ mova m4, m6 ; A3 a3
+ paddd m6, m2 ; A3+B3 a3+b3
+ psubd m4, m2 ; a3-B3 a3-b3
+ psrad m6, %6
+ psrad m4, %6
+ packssdw m1, m1 ; A2+B2 a2+b2
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [32 + %5], m1
+ packssdw m4, m4 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [48 + %5], m6
+ movq [64 + %5], m4
+ movq [80 + %5], m5
%endmacro
%macro IDCT6 6
- movq mm0, [%1] ; R4 R0 r4 r0
- movq mm1, [%2] ; R6 R2 r6 r2
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- paddd mm0, mm1 ; A1 a1
- psubd mm5, mm1 ; A2 a2
- movq mm2, [8 + %1] ; R4 R0 r4 r0
- movq mm3, [8 + %2] ; R6 R2 r6 r2
- movq mm1, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm7, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm7, mm3 ; C6R6+C2R2 C6r6+C2r2
- pmaddwd mm3, [coeffs + 40] ; -C2R6+C6R2 -C2r6+C6r2
- paddd mm7, mm1 ; A0 a0
- paddd mm1, mm1 ; 2C0 2c0
- psubd mm1, mm7 ; A3 a3
- paddd mm3, mm2 ; A1 a1
- paddd mm2, mm2 ; 2C1 2c1
- psubd mm2, mm3 ; A2 a2
- psrad mm4, %6
- psrad mm7, %6
- psrad mm3, %6
- packssdw mm4, mm7 ; A0 a0
- movq [%5], mm4
- psrad mm0, %6
- packssdw mm0, mm3 ; A1 a1
- movq [16 + %5], mm0
- movq [96 + %5], mm0
- movq [112 + %5], mm4
- psrad mm5, %6
- psrad mm6, %6
- psrad mm2, %6
- packssdw mm5, mm2 ; A2-B2 a2-b2
- movq [32 + %5], mm5
- psrad mm1, %6
- packssdw mm6, mm1 ; A3+B3 a3+b3
- movq [48 + %5], mm6
- movq [64 + %5], mm6
- movq [80 + %5], mm5
+ movq m0, [%1] ; R4 R0 r4 r0
+ movhps m0, [%1 + 16]
+ movq m1, [%2] ; R6 R2 r6 r2
+ movhps m1, [%2 + 16]
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ mova m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ paddd m0, m1 ; A1 a1
+ psubd m5, m1 ; A2 a2
+ movq m2, [%1 + 8] ; R4 R0 r4 r0
+ movhps m2, [%1 + 24]
+ movq m3, [%2 + 8] ; R6 R2 r6 r2
+ movhps m3, [%2 + 24]
+ mova m1, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m1, m2 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m2, m7 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m7, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m7, m3 ; C6R6+C2R2 C6r6+C2r2
+ pmaddwd m3, [coeffs + 80] ; -C2R6+C6R2 -C2r6+C6r2
+ paddd m7, m1 ; A0 a0
+ paddd m1, m1 ; 2C0 2c0
+ psubd m1, m7 ; A3 a3
+ paddd m3, m2 ; A1 a1
+ paddd m2, m2 ; 2C1 2c1
+ psubd m2, m3 ; A2 a2
+ psrad m4, %6
+ psrad m7, %6
+ psrad m3, %6
+ packssdw m4, m7 ; A0 a0
+ pshufd m4, m4, 0xD8
+ mova [%5], m4
+ psrad m0, %6
+ packssdw m0, m3 ; A1 a1
+ pshufd m0, m0, 0xD8
+ mova [16 + %5], m0
+ mova [96 + %5], m0
+ mova [112 + %5], m4
+ psrad m5, %6
+ psrad m6, %6
+ psrad m2, %6
+ packssdw m5, m2 ; A2-B2 a2-b2
+ pshufd m5, m5, 0xD8
+ mova [32 + %5], m5
+ psrad m1, %6
+ packssdw m6, m1 ; A3+B3 a3+b3
+ pshufd m6, m6, 0xD8
+ mova [48 + %5], m6
+ mova [64 + %5], m6
+ mova [80 + %5], m5
%endmacro
%macro IDCT7 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm1, %2 ; R6 R2 r6 r2
- movq mm2, %3 ; R3 R1 r3 r1
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- paddd mm0, mm1 ; A1 a1
- psubd mm5, mm1 ; A2 a2
- movq mm1, [coeffs + 64]
- pmaddwd mm1, mm2 ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- psrad mm7, %6
- psrad mm4, %6
- movq mm3, mm0 ; A1 a1
- paddd mm0, mm1 ; A1+B1 a1+b1
- psubd mm3, mm1 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm3, %6
- packssdw mm7, mm7 ; A0+B0 a0+b0
- movd [%5], mm7
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm3, mm3 ; A1-B1 a1-b1
- movd [96 + %5], mm3
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1
- pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- movq mm3, mm5 ; A2 a2
- paddd mm3, mm4 ; A2+B2 a2+b2
- psubd mm5, mm4 ; a2-B2 a2-b2
- psrad mm3, %6
- psrad mm5, %6
- movq mm4, mm6 ; A3 a3
- paddd mm6, mm2 ; A3+B3 a3+b3
- psubd mm4, mm2 ; a3-B3 a3-b3
- psrad mm6, %6
- packssdw mm3, mm3 ; A2+B2 a2+b2
- movd [32 + %5], mm3
- psrad mm4, %6
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [48 + %5], mm6
- packssdw mm4, mm4 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [64 + %5], mm4
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m1, %2 ; R6 R2 r6 r2
+ mova m2, %3 ; R3 R1 r3 r1
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ mova m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ paddd m0, m1 ; A1 a1
+ psubd m5, m1 ; A2 a2
+ mova m1, [coeffs + 128]
+ pmaddwd m1, m2 ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ psrad m7, %6
+ psrad m4, %6
+ mova m3, m0 ; A1 a1
+ paddd m0, m1 ; A1+B1 a1+b1
+ psubd m3, m1 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m3, %6
+ packssdw m7, m7 ; A0+B0 a0+b0
+ movq [%5], m7
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m3, m3 ; A1-B1 a1-b1
+ movq [96 + %5], m3
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ pmaddwd m4, m2 ; -C1R3+C5R1 -C1r3+C5r1
+ pmaddwd m2, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ mova m3, m5 ; A2 a2
+ paddd m3, m4 ; A2+B2 a2+b2
+ psubd m5, m4 ; a2-B2 a2-b2
+ psrad m3, %6
+ psrad m5, %6
+ mova m4, m6 ; A3 a3
+ paddd m6, m2 ; A3+B3 a3+b3
+ psubd m4, m2 ; a3-B3 a3-b3
+ psrad m6, %6
+ packssdw m3, m3 ; A2+B2 a2+b2
+ movq [32 + %5], m3
+ psrad m4, %6
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [48 + %5], m6
+ packssdw m4, m4 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [64 + %5], m4
+ movq [80 + %5], m5
%endmacro
%macro IDCT8 6
- movq mm0, [%1] ; R4 R0 r4 r0
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- psrad mm4, %6
- psrad mm0, %6
- movq mm2, [8 + %1] ; R4 R0 r4 r0
- movq mm1, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm7, [coeffs + 32] ; C6 C2 C6 C2
- psrad mm1, %6
- packssdw mm4, mm1 ; A0 a0
- movq [%5], mm4
- psrad mm2, %6
- packssdw mm0, mm2 ; A1 a1
- movq [16 + %5], mm0
- movq [96 + %5], mm0
- movq [112 + %5], mm4
- movq [32 + %5], mm0
- movq [48 + %5], mm4
- movq [64 + %5], mm4
- movq [80 + %5], mm0
+ movq m0, [%1] ; R4 R0 r4 r0
+ movhps m0, [%1 + 16]
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ psrad m4, %6
+ psrad m0, %6
+ movq m2, [%1 + 8] ; R4 R0 r4 r0
+ movhps m2, [%1 + 24]
+ mova m1, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m1, m2 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m2, m7 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m7, [coeffs + 64] ; C6 C2 C6 C2
+ psrad m1, %6
+ packssdw m4, m1 ; A0 a0
+ pshufd m4, m4, 0xD8
+ mova [%5], m4
+ psrad m2, %6
+ packssdw m0, m2 ; A1 a1
+ pshufd m0, m0, 0xD8
+ mova [16 + %5], m0
+ mova [96 + %5], m0
+ mova [112 + %5], m4
+ mova [32 + %5], m0
+ mova [48 + %5], m4
+ mova [64 + %5], m4
+ mova [80 + %5], m0
%endmacro
%macro IDCT 0
@@ -710,9 +735,7 @@ SECTION .text
Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%1
IDCT1 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT1 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
@@ -721,9 +744,7 @@ SECTION .text
Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5
IDCT2 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT2 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
@@ -731,9 +752,7 @@ SECTION .text
Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7
IDCT3 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT3 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
@@ -741,41 +760,33 @@ SECTION .text
Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3
IDCT4 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT4 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
%%3:
IDCT5 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT5 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
%%5:
IDCT6 rsp + 0, rsp + 64, rsp + 32, rsp + 96, blockq + 0, 20
- IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20
jmp %%9
ALIGN 16
%%1:
IDCT7 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT7 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
%%7:
IDCT8 rsp + 0, rsp + 64, rsp + 32, rsp + 96, blockq + 0, 20
- IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20
%%9:
%endmacro
@@ -805,15 +816,12 @@ SECTION .text
movhps [pixelsq+lsizeq], m0
%endmacro
-INIT_MMX mmx
+INIT_XMM sse2
cglobal simple_idct, 1, 2, 8, 128, block, t0
IDCT
- emms
RET
-INIT_XMM sse2
-
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
IDCT
lea lsize3q, [lsizeq*3]
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index 9b64cfe9bc..c9ba6aedaf 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -22,10 +22,7 @@
#include <stddef.h>
#include <stdint.h>
-void ff_simple_idct_mmx(int16_t *block);
-void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
-void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
-
+void ff_simple_idct_sse2(int16_t *block);
void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
--
2.49.1
>From e7152c27777f5ade271e5c06487776dfa9e6eb94 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 5 Nov 2025 05:08:28 +0100
Subject: [PATCH 7/7] avcodec/dvdec,mjpegdec: Remove emms_c
It is no longer necessary now that the IDCTDSP is always ABI-compliant
(and free of MMX).
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/dvdec.c | 3 ---
libavcodec/mjpegdec.c | 2 --
2 files changed, 5 deletions(-)
diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c
index 242708c70a..4799ec96dc 100644
--- a/libavcodec/dvdec.c
+++ b/libavcodec/dvdec.c
@@ -36,7 +36,6 @@
*/
#include "libavutil/avassert.h"
-#include "libavutil/emms.h"
#include "libavutil/internal.h"
#include "libavutil/mem_internal.h"
#include "libavutil/thread.h"
@@ -683,8 +682,6 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, AVFrame *frame,
avctx->execute(avctx, dv_decode_video_segment, s->work_chunks, NULL,
dv_work_pool_size(s->sys), sizeof(DVwork_chunk));
- emms_c();
-
/* return image */
*got_frame = 1;
diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c
index 5fd77073da..fb39c4e9fd 100644
--- a/libavcodec/mjpegdec.c
+++ b/libavcodec/mjpegdec.c
@@ -33,7 +33,6 @@
#include "config_components.h"
#include "libavutil/attributes.h"
-#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
@@ -1824,7 +1823,6 @@ next_field:
}
}
- emms_c();
return 0;
out_of_range:
av_log(s->avctx, AV_LOG_ERROR, "decode_sos: ac/dc index out of range\n");
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-11-05 4:12 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-05 4:11 [FFmpeg-devel] [PATCH] Stop using MMX in IDCTDSP (PR #20838) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git