Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH] Stop using MMX in IDCTDSP (PR #20838)
@ 2025-11-05  4:11 mkver via ffmpeg-devel
  0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-11-05  4:11 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: mkver

PR #20838 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20838
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20838.patch


>From 549f85f6c3f32f90429bed8362e8817268fad862 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 4 Nov 2025 13:56:01 +0100
Subject: [PATCH 1/7] avcodec/x86/idctdsp_init: Fix IDCT permutation for 32bit
 without SSE2

bfb28b5ce89f3e950214b67ea95b45e3355c2caf removed the MMX idct_put
and idct_add functions, because they were overridden by SSE2 versions
(which use SSE2 only for the put/add part, not for the actual IDCT).
This meant that idct, idct_put and idct_add were no longer set in
unison for MMX, so the permutation, which is meant to apply to all
three, was incorrect on 32-bit systems with SSE2 unavailable or
disabled.

Fix this by setting the MMX IDCT (together with the SSE2 put/add
versions) only if SSE2 is enabled.

(No one complained, so apparently no one runs a recent FFmpeg
on non-SSE2-capable systems.)
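
For illustration, a hedged sketch of the invariant at stake (the
struct mirrors the relevant IDCTDSPContext fields; the typedef
name is made up):

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch: the coefficient block reaches the callbacks already
     * permuted according to perm_type, so idct, idct_put and
     * idct_add must all expect the same coefficient order and must
     * therefore be set in unison. */
    typedef struct {
        void (*idct)(int16_t *block);
        void (*idct_put)(uint8_t *dest, ptrdiff_t line_size,
                         int16_t *block);
        void (*idct_add)(uint8_t *dest, ptrdiff_t line_size,
                         int16_t *block);
        int perm_type; /* one permutation, shared by all three */
    } MiniIDCTCtx;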

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/idctdsp_init.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 2d165b975b..281d143ade 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -65,18 +65,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if ARCH_X86_32
-    if (EXTERNAL_MMX(cpu_flags)) {
-        if (!high_bit_depth &&
-            avctx->lowres == 0 &&
-            (avctx->idct_algo == FF_IDCT_AUTO ||
-                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
-                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
-                c->idct      = ff_simple_idct_mmx;
-        }
-    }
-#endif
-
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
         c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
@@ -88,6 +76,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
             (avctx->idct_algo == FF_IDCT_AUTO ||
                 avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
                 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                c->idct      = ff_simple_idct_mmx;
                 c->idct_put  = ff_simple_idct_put_sse2;
                 c->idct_add  = ff_simple_idct_add_sse2;
                 c->perm_type = FF_IDCT_PERM_SIMPLE;
-- 
2.49.1


>From d4013319afd63deb83cc1dbf2816382854085379 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 4 Nov 2025 17:53:30 +0100
Subject: [PATCH 2/7] avcodec/tests/x86/dct: Test 32bit simple idct

The test was removed in bfb28b5ce89f3e950214b67ea95b45e3355c2caf
along with the MMX idctdsp functions overridden by SSE2; but that
commit only disabled ff_simple_idct_mmx() completely on x64, so
the test should have been disabled on x64 instead of being removed.
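
For context, a hedged sketch of what such a test-table entry
carries (field names inferred from the entries visible in this
series; the actual struct in libavcodec/tests/dct.c may differ
in details):

    #include <stdint.h>

    /* Each idct_tab_arch entry names an IDCT implementation, the
     * coefficient permutation its input expects, and the CPU flag
     * gating it; the test permutes the input accordingly before
     * calling func and comparing against the C reference. */
    struct algo {
        const char *name;             /* e.g. "SIMPLE-SSE2"       */
        void (*func)(int16_t *block);
        int perm_type;                /* e.g. FF_IDCT_PERM_SIMPLE  */
        int cpu_flag;                 /* e.g. AV_CPU_FLAG_SSE2     */
        int nonspec;                  /* assumed: 1 = not bit-exact */
    };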

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/tests/x86/dct.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 7800abc7f7..e864de6904 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -88,6 +88,10 @@ static const struct algo idct_tab_arch[] = {
     { "SIMPLE10-AVX",   ff_simple_idct10_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
     { "SIMPLE12-AVX",   ff_simple_idct12_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX,  1 },
 #endif
+#else
+#if HAVE_SSE2_EXTERNAL
+    { "SIMPLE-SSE2",   ff_simple_idct_mmx,  FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2},
+#endif
 #endif
 #endif
     { 0 }
-- 
2.49.1


>From d30025d2857dc3cdcc9eb4c09ed85794473ac3a1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 5 Nov 2025 02:59:59 +0100
Subject: [PATCH 3/7] avcodec/x86/xvididct: Don't use MMX registers in SSE2
 function

Using MMX registers in an SSE2 function is highly surprising and
would necessitate emms in order to be ABI-compliant; but it is
better just not to use them in the first place.
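
As a hedged illustration of the ABI point (the helper below is
hypothetical, not FFmpeg code): MMX registers alias the x87
register stack, so a function that touches them must issue emms
before returning, or later x87 floating-point code in the caller
misbehaves.

    #include <stdint.h>
    #include <mmintrin.h>

    /* Hypothetical MMX helper: without _mm_empty() (emms), the
     * x87 tag word would still mark all registers as in use after
     * returning, breaking subsequent x87 float math. */
    static int32_t add2x32_mmx(const int32_t *a, const int32_t *b)
    {
        __m64 v = _mm_add_pi32(*(const __m64 *)a,
                               *(const __m64 *)b);  /* paddd */
        int32_t lo = _mm_cvtsi64_si32(v);           /* movd  */
        _mm_empty();  /* emms: hand the FPU back to the caller */
        return lo;
    }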

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/xvididct.asm | 76 ++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 34 deletions(-)

diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 4197551cdf..0daa2edd42 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -101,8 +101,6 @@ walkenIdctRounders: times 4 dd 65536
                     times 4 dd   512
                     times 2 dd     0
 
-pb_127: times 8 db 127
-
 SECTION .text
 
 ; Temporary storage before the column pass
@@ -167,36 +165,47 @@ SECTION .text
 %define TAN1  xmm2
 %endif
 
-%macro JZ  2
-    test      %1, %1
+%macro JZ  3
+    test    %1%3, %1%3
     jz       .%2
 %endmacro
 
-%macro JNZ  2
-    test      %1, %1
+%macro JNZ  3
+    test    %1%3, %1%3
     jnz      .%2
 %endmacro
 
 %macro TEST_ONE_ROW 4 ; src, reg, clear, arg
     %3        %4
-    movq     mm1, [%1]
-    por      mm1, [%1 + 8]
-    paddusb  mm1, mm0
-    pmovmskb  %2, mm1
+    mova       m1, [%1]
+    ; due to signed saturation, the packed result is all zero iff m1 is all zero
+    packsswb   m1, m1
+%if ARCH_X86_64
+    movq       %2, m1
+%else
+    packsswb   m1, m1
+    movd       %2, m1
+%endif
 %endmacro
 
 ;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
 %macro  TEST_TWO_ROWS  8
     %5         %6
     %7         %8
-    movq      mm1, [%1 + 0]
-    por       mm1, [%1 + 8]
-    movq      mm2, [%2 + 0]
-    por       mm2, [%2 + 8]
-    paddusb   mm1, mm0
-    paddusb   mm2, mm0
-    pmovmskb   %3, mm1
-    pmovmskb   %4, mm2
+    mova       m1, [%1]
+    packsswb   m1, [%2]
+    packsswb   m1, m1
+%if ARCH_X86_64
+    movq       %4, m1
+    mov       %3d, %4d
+    shr       %4q, 32
+%else
+    packsswb   m1, m1
+    movd       %3, m1
+    mov        %4, %3
+    shr        %4, 16
+    and        %3, 0xFFFF
+%endif
 %endmacro
 
 ; IDCT pass on rows.
@@ -499,16 +508,16 @@ SECTION .text
 
 %macro IDCT_SSE2 1 ; 0=normal  1=put  2=add
 %if %1 == 0 || ARCH_X86_32
-    %define GPR0  r1d
-    %define GPR1  r2d
-    %define GPR2  r3d
-    %define GPR3  r4d
+    %define GPR0  r1
+    %define GPR1  r2
+    %define GPR2  r3
+    %define GPR3  r4
     %define NUM_GPRS 5
 %else
-    %define GPR0  r3d
-    %define GPR1  r4d
-    %define GPR2  r5d
-    %define GPR3  r6d
+    %define GPR0  r3
+    %define GPR1  r4
+    %define GPR2  r5
+    %define GPR3  r6
     %define NUM_GPRS 7
 %endif
 %if %1 == 0
@@ -527,34 +536,33 @@ cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
     %xdefine BLOCK r0q
     %endif
 %endif
-    movq           mm0, [pb_127]
     iMTX_MULT      BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
     iMTX_MULT      BLOCK + 1*16, iTab2, PUT_ODD, ROW1,  1*16
     iMTX_MULT      BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
 
     TEST_TWO_ROWS  BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
-    JZ   GPR0, col1
+    JZ   GPR0, col1, d
     iMTX_MULT      BLOCK + 3*16, iTab4, PUT_ODD, ROW3,  3*16
 .col1:
     TEST_TWO_ROWS  BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
     TEST_ONE_ROW   BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
 
     iLLM_HEAD
-    JNZ  GPR1, 2
-    JNZ  GPR0, 3
-    JNZ  GPR2, 4
-    JNZ  GPR3, 5
+    JNZ  GPR1, 2, d
+    JNZ  GPR0, 3, d
+    JNZ  GPR2, 4, d
+    JNZ  GPR3, 5, q
     iLLM_PASS_SPARSE BLOCK, %1
     jmp .6
 .2:
     iMTX_MULT     BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
 .3:
     iMTX_MULT     BLOCK + 5*16, iTab4, PUT_ODD, ROW5,  4*16
-    JZ   GPR2, col2
+    JZ   GPR2, col2, d
 .4:
     iMTX_MULT     BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
 .col2:
-    JZ   GPR3, col3
+    JZ   GPR3, col3, q
 .5:
     iMTX_MULT     BLOCK + 7*16, iTab2, PUT_ODD, ROW7,  5*16
 .col3:
-- 
2.49.1


>From a7e3cde808bd620e0bb9616261dd5c12cc71da97 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 5 Nov 2025 03:58:12 +0100
Subject: [PATCH 4/7] avcodec/x86/xvididct: Remove remnants of MMX

The non-MMX code only uses the first six rounders.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/xvididct.asm | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 0daa2edd42..c3bfabb955 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -24,7 +24,7 @@
 ;
 ; More details at http://skal.planet-d.net/coding/dct.html
 ;
-; =======     MMX and XMM forward discrete cosine transform     =======
+; ===========     XMM forward discrete cosine transform     ===========
 ;
 ; Copyright(C) 2001 Peter Ross <pross@xvid.org>
 ;
@@ -67,7 +67,6 @@
 %include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
-; Similar to tg_1_16 in MMX code
 tan1:   times 8 dw 13036
 tan2:   times 8 dw 27146
 tan3:   times 8 dw 43790
@@ -91,7 +90,6 @@ iTab4:  dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
         dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
         dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
 
-; Similar to rounder_0 in MMX code
 ; 4 first similar, then: 4*8->6*16  5*8->4*16  6/7*8->5*16
 walkenIdctRounders: times 4 dd 65536
                     times 4 dd  3597
@@ -99,7 +97,6 @@ walkenIdctRounders: times 4 dd 65536
                     times 4 dd  1203
                     times 4 dd   120
                     times 4 dd   512
-                    times 2 dd     0
 
 SECTION .text
 
-- 
2.49.1


>From 6b353d7a322b1142d80bdce172cabe8257f239c1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 5 Nov 2025 04:04:02 +0100
Subject: [PATCH 5/7] avcodec/tests/dct: Remove unnecessary emms_c

Unnecessary since the Xvid IDCT no longer uses MMX registers at all.
(Notice that the simple MMX IDCT issues emms and is therefore ABI
compliant.)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/tests/dct.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/libavcodec/tests/dct.c b/libavcodec/tests/dct.c
index 784b49276c..eb74f3559e 100644
--- a/libavcodec/tests/dct.c
+++ b/libavcodec/tests/dct.c
@@ -37,7 +37,6 @@
 
 #include "libavutil/cpu.h"
 #include "libavutil/common.h"
-#include "libavutil/emms.h"
 #include "libavutil/internal.h"
 #include "libavutil/lfg.h"
 #include "libavutil/mem_internal.h"
@@ -212,7 +211,6 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed, c
         permute(block, block1, dct->perm_type);
 
         dct->func(block);
-        emms_c();
 
         if (!strcmp(dct->name, "IJG-AAN-INT")) {
             for (i = 0; i < 64; i++) {
@@ -287,7 +285,6 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed, c
             memcpy(block, block1, sizeof(block));
             dct->func(block);
         }
-        emms_c();
         it1 += NB_ITS_SPEED;
         ti1 = av_gettime_relative() - ti;
     } while (ti1 < 1000000);
@@ -449,7 +446,6 @@ static void idct248_error(const char *name,
                 block[i] = block1[i];
             idct248_put(img_dest, 8, block);
         }
-        emms_c();
         it1 += NB_ITS_SPEED;
         ti1 = av_gettime_relative() - ti;
     } while (ti1 < 1000000);
-- 
2.49.1


>From 7b6c5ddebd8c43127ac0d2f1c744cd9d17d9a7ca Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 4 Nov 2025 14:25:54 +0100
Subject: [PATCH 6/7] avcodec/x86/simple_idct: Port to SSE2

Before this commit, the (32-bit only) simple IDCT came in three
versions: a pure MMX IDCT, plus idct_put and idct_add versions
that use SSE2 for the put/add stage but still pure MMX for the
actual IDCT.

This commit ports said IDCT to SSE2; this was entirely trivial
for the IDCT1-5 and IDCT7 parts (where one can directly use the
full register width) and was easy for IDCT6 and IDCT8 (involving
a few movhps and pshufds). Unfortunately, DC_COND_IDCT and
Z_COND_IDCT still use only the lower half of the registers.
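
For the pshufd fix-ups mentioned above, a hedged scalar model of
what the 0xD8 immediate does (generic pshufd semantics, not tied
to the exact data layout in this file):

    #include <stdint.h>
    #include <stdio.h>

    /* pshufd dst, src, imm picks dword lanes imm[1:0], imm[3:2],
     * imm[5:4], imm[7:6]; 0xD8 = 0b11011000 picks 0, 2, 1, 3,
     * i.e. it swaps the two middle 32-bit lanes. */
    static void pshufd_0xd8(uint32_t d[4])
    {
        uint32_t t = d[1];
        d[1] = d[2];
        d[2] = t;
    }

    int main(void)
    {
        uint32_t v[4] = { 0, 2, 1, 3 };  /* lane indices as data */
        pshufd_0xd8(v);                  /* -> 0, 1, 2, 3        */
        for (int i = 0; i < 4; i++)
            printf("%u\n", v[i]);
        return 0;
    }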

This saved 4658B here; the benchmarking option of the dct test tool
showed a 15% speedup.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/tests/x86/dct.c     |    2 +-
 libavcodec/x86/idctdsp_init.c  |    2 +-
 libavcodec/x86/simple_idct.asm | 1242 ++++++++++++++++----------------
 libavcodec/x86/simple_idct.h   |    5 +-
 4 files changed, 628 insertions(+), 623 deletions(-)

diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index e864de6904..f879ab1d42 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -90,7 +90,7 @@ static const struct algo idct_tab_arch[] = {
 #endif
 #else
 #if HAVE_SSE2_EXTERNAL
-    { "SIMPLE-SSE2",   ff_simple_idct_mmx,  FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2},
+    { "SIMPLE-SSE2",   ff_simple_idct_sse2,  FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2},
 #endif
 #endif
 #endif
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 281d143ade..9c7f235b3f 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -76,7 +76,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
             (avctx->idct_algo == FF_IDCT_AUTO ||
                 avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
                 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
-                c->idct      = ff_simple_idct_mmx;
+                c->idct      = ff_simple_idct_sse2;
                 c->idct_put  = ff_simple_idct_put_sse2;
                 c->idct_add  = ff_simple_idct_add_sse2;
                 c->perm_type = FF_IDCT_PERM_SIMPLE;
diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm
index c79519372a..2410737038 100644
--- a/libavcodec/x86/simple_idct.asm
+++ b/libavcodec/x86/simple_idct.asm
@@ -30,8 +30,8 @@ SECTION_RODATA
 %if ARCH_X86_32
 cextern pb_80
 
+d40000: dd 4 << 16, 0 ; must be 16-byte aligned
 wm1010: dw 0, 0xffff, 0, 0xffff
-d40000: dd 4 << 16, 0
 
 ; 23170.475006
 ; 22725.260826
@@ -57,650 +57,675 @@ d40000: dd 4 << 16, 0
 coeffs:
     dw 1 << (ROW_SHIFT - 1), 0
     dw 1 << (ROW_SHIFT - 1), 0
+    dw 1 << (ROW_SHIFT - 1), 0
+    dw 1 << (ROW_SHIFT - 1), 0
+    dw 1 << (ROW_SHIFT - 1), 1
+    dw 1 << (ROW_SHIFT - 1), 0
     dw 1 << (ROW_SHIFT - 1), 1
     dw 1 << (ROW_SHIFT - 1), 0
 
-    dw C4,  C4,  C4,  C4
-    dw C4, -C4,  C4, -C4
+    dw C4,  C4,  C4,  C4, C4,  C4,  C4,  C4
+    dw C4, -C4,  C4, -C4, C4, -C4,  C4, -C4
 
-    dw C2,  C6,  C2,  C6
-    dw C6, -C2,  C6, -C2
+    dw C2,  C6,  C2,  C6, C2,  C6,  C2,  C6
+    dw C6, -C2,  C6, -C2, C6, -C2,  C6, -C2
 
-    dw C1,  C3,  C1,  C3
-    dw C5,  C7,  C5,  C7
+    dw C1,  C3,  C1,  C3, C1,  C3,  C1,  C3
+    dw C5,  C7,  C5,  C7, C5,  C7,  C5,  C7
 
-    dw C3, -C7,  C3, -C7
-    dw -C1, -C5, -C1, -C5
+    dw  C3, -C7,  C3, -C7,  C3, -C7,  C3, -C7
+    dw -C1, -C5, -C1, -C5, -C1, -C5, -C1, -C5
 
-    dw C5, -C1,  C5, -C1
-    dw C7,  C3,  C7,  C3
+    dw C5, -C1,  C5, -C1, C5, -C1,  C5, -C1
+    dw C7,  C3,  C7,  C3, C7,  C3,  C7,  C3
 
-    dw C7, -C5,  C7, -C5
-    dw C3, -C1,  C3, -C1
+    dw C7, -C5,  C7, -C5, C7, -C5,  C7, -C5
+    dw C3, -C1,  C3, -C1, C3, -C1,  C3, -C1
 
 SECTION .text
 
 %macro DC_COND_IDCT 7
-    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
-    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
-    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
-    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
-    movq            mm4, [wm1010]
-    pand            mm4, mm0
-    por             mm4, mm1
-    por             mm4, mm2
-    por             mm4, mm3
-    packssdw        mm4, mm4
-    movd            t0d, mm4
+    movq             m0, [blockq + %1]  ; R4     R0      r4      r0
+    movq             m1, [blockq + %2]  ; R6     R2      r6      r2
+    movq             m2, [blockq + %3]  ; R3     R1      r3      r1
+    movq             m3, [blockq + %4]  ; R7     R5      r7      r5
+    movq             m4, [wm1010]
+    pand             m4, m0
+    por              m4, m1
+    por              m4, m2
+    por              m4, m3
+    packssdw         m4, m4
+    movd            t0d, m4
     or              t0d, t0d
     jz              %%1
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    paddd           mm4, [coeffs + 8]
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
-    paddd           mm0, [coeffs + 8]
-    paddd           mm1, mm0            ; A1             a1
-    paddd           mm0, mm0
-    psubd           mm0, mm1            ; A2             a2
-    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm5            ; B0             b0
-    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    paddd           mm5, mm2            ; B1             b1
-    psrad           mm7, %7
-    psrad           mm4, %7
-    movq            mm2, mm1            ; A1             a1
-    paddd           mm1, mm5            ; A1+B1          a1+b1
-    psubd           mm2, mm5            ; A1-B1          a1-b1
-    psrad           mm1, %7
-    psrad           mm2, %7
-    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
-    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
-    movq           [%5], mm7
-    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    movq      [24 + %5], mm2
-    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
-    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm0            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm4, mm7            ; B2             b2
-    paddd           mm2, mm4            ; A2+B2          a2+b2
-    psubd           mm0, mm4            ; a2-B2          a2-b2
-    psrad           mm2, %7
-    psrad           mm0, %7
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm3, mm1            ; B3             b3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm4, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %7
-    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
-    movq       [8 + %5], mm2
-    psrad           mm4, %7
-    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
-    movq      [16 + %5], mm4
+    movq             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    movq             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    movq             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    movq             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    movq             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    paddd            m4, [coeffs + 16]
+    movq             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    movq             m5, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m5, m3             ; C7R7+C5R5      C7r7+C5r5
+    paddd            m0, [coeffs + 16]
+    paddd            m1, m0             ; A1             a1
+    paddd            m0, m0
+    psubd            m0, m1             ; A2             a2
+    pmaddwd          m2, [coeffs + 128] ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m5             ; B0             b0
+    movq             m5, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m5, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    paddd            m5, m2             ; B1             b1
+    psrad            m7, %7
+    psrad            m4, %7
+    movq             m2, m1             ; A1             a1
+    paddd            m1, m5             ; A1+B1          a1+b1
+    psubd            m2, m5             ; A1-B1          a1-b1
+    psrad            m1, %7
+    psrad            m2, %7
+    packssdw         m7, m1             ; A1+B1  a1+b1   A0+B0   a0+b0
+    pshufd           m7, m7, 0xD8
+    packssdw         m2, m4             ; A0-B0  a0-b0   A1-B1   a1-b1
+    pshufd           m2, m2, 0xD8
+    movq           [%5], m7
+    movq             m1, [blockq + %3]  ; R3     R1      r3      r1
+    movq             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    movq      [24 + %5], m2
+    pmaddwd          m4, m1             ; -C1R3+C5R1     -C1r3+C5r1
+    movq             m7, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m1, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd          m7, m3             ; C3R7+C7R5      C3r7+C7r5
+    movq             m2, m0             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m4, m7             ; B2             b2
+    paddd            m2, m4             ; A2+B2          a2+b2
+    psubd            m0, m4             ; a2-B2          a2-b2
+    psrad            m2, %7
+    psrad            m0, %7
+    movq             m4, m6             ; A3             a3
+    paddd            m3, m1             ; B3             b3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m4, m3             ; a3-B3          a3-b3
+    psrad            m6, %7
+    packssdw         m2, m6             ; A3+B3  a3+b3   A2+B2   a2+b2
+    pshufd           m2, m2, 0xD8
+    movq       [8 + %5], m2
+    psrad            m4, %7
+    packssdw         m4, m0             ; A2-B2  a2-b2   A3-B3   a3-b3
+    pshufd           m4, m4, 0xD8
+    movq      [16 + %5], m4
     jmp             %%2
 %%1:
-    pslld           mm0, 16
-    paddd           mm0, [d40000]
-    psrad           mm0, 13
-    packssdw        mm0, mm0
-    movq           [%5], mm0
-    movq       [8 + %5], mm0
-    movq      [16 + %5], mm0
-    movq      [24 + %5], mm0
+    pslld            m0, 16
+    ; d40000 is only eight bytes long, so this will clobber
+    ; the upper half of m0 with wm1010. It doesn't matter due to pshufd below.
+    paddd            m0, [d40000]
+    psrad            m0, 13
+    packssdw         m0, m0
+    pshufd           m0, m0, 0x0
+    mova           [%5], m0
+    mova      [16 + %5], m0
 %%2:
 %endmacro
 
 %macro Z_COND_IDCT 8
-    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
-    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
-    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
-    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
-    movq            mm4, mm0
-    por             mm4, mm1
-    por             mm4, mm2
-    por             mm4, mm3
-    packssdw        mm4, mm4
-    movd            t0d, mm4
+    movq             m0, [blockq + %1]  ; R4     R0      r4      r0
+    movq             m1, [blockq + %2]  ; R6     R2      r6      r2
+    movq             m2, [blockq + %3]  ; R3     R1      r3      r1
+    movq             m3, [blockq + %4]  ; R7     R5      r7      r5
+    movq             m4, m0
+    por              m4, m1
+    por              m4, m2
+    por              m4, m3
+    packssdw         m4, m4
+    movd            t0d, m4
     or              t0d, t0d
     jz               %8
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    paddd           mm4, [coeffs]
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
-    paddd           mm0, [coeffs]
-    paddd           mm1, mm0            ; A1             a1
-    paddd           mm0, mm0
-    psubd           mm0, mm1            ; A2             a2
-    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm5            ; B0             b0
-    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    paddd           mm5, mm2            ; B1             b1
-    psrad           mm7, %7
-    psrad           mm4, %7
-    movq            mm2, mm1            ; A1             a1
-    paddd           mm1, mm5            ; A1+B1          a1+b1
-    psubd           mm2, mm5            ; A1-B1          a1-b1
-    psrad           mm1, %7
-    psrad           mm2, %7
-    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
-    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
-    movq           [%5], mm7
-    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    movq      [24 + %5], mm2
-    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
-    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm0            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm4, mm7            ; B2             b2
-    paddd           mm2, mm4            ; A2+B2          a2+b2
-    psubd           mm0, mm4            ; a2-B2          a2-b2
-    psrad           mm2, %7
-    psrad           mm0, %7
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm3, mm1            ; B3             b3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm4, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %7
-    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
-    movq       [8 + %5], mm2
-    psrad           mm4, %7
-    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
-    movq      [16 + %5], mm4
+    movq             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    movq             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    movq             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    movq             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    movq             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    paddd            m4, [coeffs]
+    movq             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    movq             m5, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m5, m3             ; C7R7+C5R5      C7r7+C5r5
+    paddd            m0, [coeffs]
+    paddd            m1, m0             ; A1             a1
+    paddd            m0, m0
+    psubd            m0, m1             ; A2             a2
+    pmaddwd          m2, [coeffs + 128] ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m5             ; B0             b0
+    movq             m5, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m5, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    paddd            m5, m2             ; B1             b1
+    psrad            m7, %7
+    psrad            m4, %7
+    movq             m2, m1             ; A1             a1
+    paddd            m1, m5             ; A1+B1          a1+b1
+    psubd            m2, m5             ; A1-B1          a1-b1
+    psrad            m1, %7
+    psrad            m2, %7
+    packssdw         m7, m1             ; A1+B1  a1+b1   A0+B0   a0+b0
+    pshufd           m7, m7, 0xD8
+    packssdw         m2, m4             ; A0-B0  a0-b0   A1-B1   a1-b1
+    pshufd           m2, m2, 0xD8
+    movq           [%5], m7
+    movq             m1, [blockq + %3]  ; R3     R1      r3      r1
+    movq             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    movq      [24 + %5], m2
+    pmaddwd          m4, m1             ; -C1R3+C5R1     -C1r3+C5r1
+    movq             m7, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m1, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd          m7, m3             ; C3R7+C7R5      C3r7+C7r5
+    movq             m2, m0             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m4, m7             ; B2             b2
+    paddd            m2, m4             ; A2+B2          a2+b2
+    psubd            m0, m4             ; a2-B2          a2-b2
+    psrad            m2, %7
+    psrad            m0, %7
+    movq             m4, m6             ; A3             a3
+    paddd            m3, m1             ; B3             b3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m4, m3             ; a3-B3          a3-b3
+    psrad            m6, %7
+    packssdw         m2, m6             ; A3+B3  a3+b3   A2+B2   a2+b2
+    pshufd           m2, m2, 0xD8
+    movq       [8 + %5], m2
+    psrad            m4, %7
+    packssdw         m4, m0             ; A2-B2  a2-b2   A3-B3   a3-b3
+    pshufd           m4, m4, 0xD8
+    movq      [16 + %5], m4
 %endmacro
 
 %macro IDCT1 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm1, %2             ; R6     R2      r6      r2
-    movq            mm2, %3             ; R3     R1      r3      r1
-    movq            mm3, %4             ; R7     R5      r7      r5
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    paddd           mm0, mm1            ; A1             a1
-    psubd           mm5, mm1            ; A2             a2
-    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
-    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm1            ; B0             b0
-    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    paddd           mm1, mm2            ; B1             b1
-    psrad           mm7, %6
-    psrad           mm4, %6
-    movq            mm2, mm0            ; A1             a1
-    paddd           mm0, mm1            ; A1+B1          a1+b1
-    psubd           mm2, mm1            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm2, %6
-    packssdw        mm7, mm7            ; A0+B0  a0+b0
-    movd           [%5], mm7
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm2, mm2            ; A1-B1  a1-b1
-    movd      [96 + %5], mm2
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm0, %3             ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
-    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm4, mm7            ; B2             b2
-    paddd           mm2, mm4            ; A2+B2          a2+b2
-    psubd           mm5, mm4            ; a2-B2          a2-b2
-    psrad           mm2, %6
-    psrad           mm5, %6
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm3, mm0            ; B3             b3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm4, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    psrad           mm4, %6
-    packssdw        mm2, mm2            ; A2+B2  a2+b2
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [32 + %5], mm2
-    packssdw        mm4, mm4            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [48 + %5], mm6
-    movd      [64 + %5], mm4
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m1, %2             ; R6     R2      r6      r2
+    mova             m2, %3             ; R3     R1      r3      r1
+    mova             m3, %4             ; R7     R5      r7      r5
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    mova             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    paddd            m0, m1             ; A1             a1
+    psubd            m5, m1             ; A2             a2
+    mova             m1, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m1, m3             ; C7R7+C5R5      C7r7+C5r5
+    pmaddwd          m2, [coeffs + 128] ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m1             ; B0             b0
+    mova             m1, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m1, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    paddd            m1, m2             ; B1             b1
+    psrad            m7, %6
+    psrad            m4, %6
+    mova             m2, m0             ; A1             a1
+    paddd            m0, m1             ; A1+B1          a1+b1
+    psubd            m2, m1             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m2, %6
+    packssdw         m7, m7             ; A0+B0  a0+b0
+    movq           [%5], m7
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m2, m2             ; A1-B1  a1-b1
+    movq      [96 + %5], m2
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m0, %3             ; R3     R1      r3      r1
+    mova             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    pmaddwd          m4, m0             ; -C1R3+C5R1     -C1r3+C5r1
+    mova             m7, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m0, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd          m7, m3             ; C3R7+C7R5      C3r7+C7r5
+    mova             m2, m5             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m4, m7             ; B2             b2
+    paddd            m2, m4             ; A2+B2          a2+b2
+    psubd            m5, m4             ; a2-B2          a2-b2
+    psrad            m2, %6
+    psrad            m5, %6
+    mova             m4, m6             ; A3             a3
+    paddd            m3, m0             ; B3             b3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m4, m3             ; a3-B3          a3-b3
+    psrad            m6, %6
+    psrad            m4, %6
+    packssdw         m2, m2             ; A2+B2  a2+b2
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [32 + %5], m2
+    packssdw         m4, m4             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [48 + %5], m6
+    movq      [64 + %5], m4
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT2 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm1, %2             ; R6     R2      r6      r2
-    movq            mm3, %4             ; R7     R5      r7      r5
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    paddd           mm0, mm1            ; A1             a1
-    psubd           mm5, mm1            ; A2             a2
-    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
-    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm1, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm1            ; A0-B0          a0-b0
-    psrad           mm1, %6
-    psrad           mm4, %6
-    movq            mm2, mm0            ; A1             a1
-    paddd           mm0, mm7            ; A1+B1          a1+b1
-    psubd           mm2, mm7            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm2, %6
-    packssdw        mm1, mm1            ; A0+B0  a0+b0
-    movd           [%5], mm1
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm2, mm2            ; A1-B1  a1-b1
-    movd      [96 + %5], mm2
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm2, mm1            ; A2+B2          a2+b2
-    psubd           mm5, mm1            ; a2-B2          a2-b2
-    psrad           mm2, %6
-    psrad           mm5, %6
-    movq            mm1, mm6            ; A3             a3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm1, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    psrad           mm1, %6
-    packssdw        mm2, mm2            ; A2+B2  a2+b2
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [32 + %5], mm2
-    packssdw        mm1, mm1            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [48 + %5], mm6
-    movd      [64 + %5], mm1
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m1, %2             ; R6     R2      r6      r2
+    mova             m3, %4             ; R7     R5      r7      r5
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    mova             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    paddd            m0, m1             ; A1             a1
+    psubd            m5, m1             ; A2             a2
+    mova             m1, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m1, m3             ; C7R7+C5R5      C7r7+C5r5
+    mova             m7, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m7, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m1, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m1             ; A0-B0          a0-b0
+    psrad            m1, %6
+    psrad            m4, %6
+    mova             m2, m0             ; A1             a1
+    paddd            m0, m7             ; A1+B1          a1+b1
+    psubd            m2, m7             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m2, %6
+    packssdw         m1, m1             ; A0+B0  a0+b0
+    movq           [%5], m1
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m2, m2             ; A1-B1  a1-b1
+    movq      [96 + %5], m2
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m1, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m1, m3             ; C3R7+C7R5      C3r7+C7r5
+    mova             m2, m5             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m2, m1             ; A2+B2          a2+b2
+    psubd            m5, m1             ; a2-B2          a2-b2
+    psrad            m2, %6
+    psrad            m5, %6
+    mova             m1, m6             ; A3             a3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m1, m3             ; a3-B3          a3-b3
+    psrad            m6, %6
+    psrad            m1, %6
+    packssdw         m2, m2             ; A2+B2  a2+b2
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [32 + %5], m2
+    packssdw         m1, m1             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [48 + %5], m6
+    movq      [64 + %5], m1
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT3 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm3, %4             ; R7     R5      r7      r5
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
-    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm1, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm1            ; A0-B0          a0-b0
-    psrad           mm1, %6
-    psrad           mm4, %6
-    movq            mm2, mm0            ; A1             a1
-    paddd           mm0, mm7            ; A1+B1          a1+b1
-    psubd           mm2, mm7            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm2, %6
-    packssdw        mm1, mm1            ; A0+B0  a0+b0
-    movd           [%5], mm1
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm2, mm2            ; A1-B1  a1-b1
-    movd      [96 + %5], mm2
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm2, mm1            ; A2+B2          a2+b2
-    psubd           mm5, mm1            ; a2-B2          a2-b2
-    psrad           mm2, %6
-    psrad           mm5, %6
-    movq            mm1, mm6            ; A3             a3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm1, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    psrad           mm1, %6
-    packssdw        mm2, mm2            ; A2+B2  a2+b2
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [32 + %5], mm2
-    packssdw        mm1, mm1            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [48 + %5], mm6
-    movd      [64 + %5], mm1
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m3, %4             ; R7     R5      r7      r5
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m1, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m1, m3             ; C7R7+C5R5      C7r7+C5r5
+    mova             m7, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m7, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m1, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m1             ; A0-B0          a0-b0
+    psrad            m1, %6
+    psrad            m4, %6
+    mova             m2, m0             ; A1             a1
+    paddd            m0, m7             ; A1+B1          a1+b1
+    psubd            m2, m7             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m2, %6
+    packssdw         m1, m1             ; A0+B0  a0+b0
+    movq           [%5], m1
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m2, m2             ; A1-B1  a1-b1
+    movq      [96 + %5], m2
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m1, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m1, m3             ; C3R7+C7R5      C3r7+C7r5
+    mova             m2, m5             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m2, m1             ; A2+B2          a2+b2
+    psubd            m5, m1             ; a2-B2          a2-b2
+    psrad            m2, %6
+    psrad            m5, %6
+    mova             m1, m6             ; A3             a3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m1, m3             ; a3-B3          a3-b3
+    psrad            m6, %6
+    psrad            m1, %6
+    packssdw         m2, m2             ; A2+B2  a2+b2
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [32 + %5], m2
+    packssdw         m1, m1             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [48 + %5], m6
+    movq      [64 + %5], m1
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT4 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm2, %3             ; R3     R1      r3      r1
-    movq            mm3, %4             ; R7     R5      r7      r5
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
-    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm1            ; B0             b0
-    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    paddd           mm1, mm2            ; B1             b1
-    psrad           mm7, %6
-    psrad           mm4, %6
-    movq            mm2, mm0            ; A1             a1
-    paddd           mm0, mm1            ; A1+B1          a1+b1
-    psubd           mm2, mm1            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm2, %6
-    packssdw        mm7, mm7            ; A0+B0  a0+b0
-    movd           [%5], mm7
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm2, mm2            ; A1-B1  a1-b1
-    movd      [96 + %5], mm2
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm0, %3             ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
-    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm4, mm7            ; B2             b2
-    paddd           mm2, mm4            ; A2+B2          a2+b2
-    psubd           mm5, mm4            ; a2-B2          a2-b2
-    psrad           mm2, %6
-    psrad           mm5, %6
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm3, mm0            ; B3             b3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm4, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    psrad           mm4, %6
-    packssdw        mm2, mm2            ; A2+B2  a2+b2
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [32 + %5], mm2
-    packssdw        mm4, mm4            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [48 + %5], mm6
-    movd      [64 + %5], mm4
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m2, %3             ; R3     R1      r3      r1
+    mova             m3, %4             ; R7     R5      r7      r5
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m1, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m1, m3             ; C7R7+C5R5      C7r7+C5r5
+    pmaddwd          m2, [coeffs + 128] ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m1             ; B0             b0
+    mova             m1, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m1, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    paddd            m1, m2             ; B1             b1
+    psrad            m7, %6
+    psrad            m4, %6
+    mova             m2, m0             ; A1             a1
+    paddd            m0, m1             ; A1+B1          a1+b1
+    psubd            m2, m1             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m2, %6
+    packssdw         m7, m7             ; A0+B0  a0+b0
+    movq           [%5], m7
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m2, m2             ; A1-B1  a1-b1
+    movq      [96 + %5], m2
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m0, %3             ; R3     R1      r3      r1
+    mova             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    pmaddwd          m4, m0             ; -C1R3+C5R1     -C1r3+C5r1
+    mova             m7, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m0, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd          m7, m3             ; C3R7+C7R5      C3r7+C7r5
+    mova             m2, m5             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m4, m7             ; B2             b2
+    paddd            m2, m4             ; A2+B2          a2+b2
+    psubd            m5, m4             ; a2-B2          a2-b2
+    psrad            m2, %6
+    psrad            m5, %6
+    mova             m4, m6             ; A3             a3
+    paddd            m3, m0             ; B3             b3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m4, m3             ; a3-B3          a3-b3
+    psrad            m6, %6
+    psrad            m4, %6
+    packssdw         m2, m2             ; A2+B2  a2+b2
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [32 + %5], m2
+    packssdw         m4, m4             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [48 + %5], m6
+    movq      [64 + %5], m4
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT5 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm2, %3             ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm3, [coeffs + 64]
-    pmaddwd         mm3, mm2            ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    psrad           mm7, %6
-    psrad           mm4, %6
-    movq            mm1, mm0            ; A1             a1
-    paddd           mm0, mm3            ; A1+B1          a1+b1
-    psubd           mm1, mm3            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm1, %6
-    packssdw        mm7, mm7            ; A0+B0  a0+b0
-    movd           [%5], mm7
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm1, mm1            ; A1-B1  a1-b1
-    movd      [96 + %5], mm1
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
-    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    movq            mm1, mm5            ; A2             a2
-    paddd           mm1, mm4            ; A2+B2          a2+b2
-    psubd           mm5, mm4            ; a2-B2          a2-b2
-    psrad           mm1, %6
-    psrad           mm5, %6
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm6, mm2            ; A3+B3          a3+b3
-    psubd           mm4, mm2            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    psrad           mm4, %6
-    packssdw        mm1, mm1            ; A2+B2  a2+b2
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [32 + %5], mm1
-    packssdw        mm4, mm4            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [48 + %5], mm6
-    movd      [64 + %5], mm4
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m2, %3             ; R3     R1      r3      r1
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m3, [coeffs + 128]
+    pmaddwd          m3, m2             ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    psrad            m7, %6
+    psrad            m4, %6
+    mova             m1, m0             ; A1             a1
+    paddd            m0, m3             ; A1+B1          a1+b1
+    psubd            m1, m3             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m1, %6
+    packssdw         m7, m7             ; A0+B0  a0+b0
+    movq           [%5], m7
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m1, m1             ; A1-B1  a1-b1
+    movq      [96 + %5], m1
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    pmaddwd          m4, m2             ; -C1R3+C5R1     -C1r3+C5r1
+    pmaddwd          m2, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    mova             m1, m5             ; A2             a2
+    paddd            m1, m4             ; A2+B2          a2+b2
+    psubd            m5, m4             ; a2-B2          a2-b2
+    psrad            m1, %6
+    psrad            m5, %6
+    mova             m4, m6             ; A3             a3
+    paddd            m6, m2             ; A3+B3          a3+b3
+    psubd            m4, m2             ; a3-B3          a3-b3
+    psrad            m6, %6
+    psrad            m4, %6
+    packssdw         m1, m1             ; A2+B2  a2+b2
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [32 + %5], m1
+    packssdw         m4, m4             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [48 + %5], m6
+    movq      [64 + %5], m4
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT6 6
-    movq            mm0, [%1]           ; R4     R0      r4      r0
-    movq            mm1, [%2]           ; R6     R2      r6      r2
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    paddd           mm0, mm1            ; A1             a1
-    psubd           mm5, mm1            ; A2             a2
-    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
-    movq            mm3, [8 + %2]       ; R6     R2      r6      r2
-    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm7, mm3            ; C6R6+C2R2      C6r6+C2r2
-    pmaddwd         mm3, [coeffs + 40]  ; -C2R6+C6R2     -C2r6+C6r2
-    paddd           mm7, mm1            ; A0             a0
-    paddd           mm1, mm1            ; 2C0            2c0
-    psubd           mm1, mm7            ; A3             a3
-    paddd           mm3, mm2            ; A1             a1
-    paddd           mm2, mm2            ; 2C1            2c1
-    psubd           mm2, mm3            ; A2             a2
-    psrad           mm4, %6
-    psrad           mm7, %6
-    psrad           mm3, %6
-    packssdw        mm4, mm7            ; A0     a0
-    movq           [%5], mm4
-    psrad           mm0, %6
-    packssdw        mm0, mm3            ; A1     a1
-    movq      [16 + %5], mm0
-    movq      [96 + %5], mm0
-    movq     [112 + %5], mm4
-    psrad           mm5, %6
-    psrad           mm6, %6
-    psrad           mm2, %6
-    packssdw        mm5, mm2            ; A2-B2  a2-b2
-    movq      [32 + %5], mm5
-    psrad           mm1, %6
-    packssdw        mm6, mm1            ; A3+B3  a3+b3
-    movq      [48 + %5], mm6
-    movq      [64 + %5], mm6
-    movq      [80 + %5], mm5
+    movq             m0, [%1]           ; R4     R0      r4      r0
+    movhps           m0, [%1 + 16]
+    movq             m1, [%2]           ; R6     R2      r6      r2
+    movhps           m1, [%2 + 16]
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    mova             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    paddd            m0, m1             ; A1             a1
+    psubd            m5, m1             ; A2             a2
+    movq             m2, [%1 + 8]       ; R4     R0      r4      r0
+    movhps           m2, [%1 + 24]
+    movq             m3, [%2 + 8]       ; R6     R2      r6      r2
+    movhps           m3, [%2 + 24]
+    mova             m1, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m1, m2             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m2, m7             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m7, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m7, m3             ; C6R6+C2R2      C6r6+C2r2
+    pmaddwd          m3, [coeffs + 80]  ; -C2R6+C6R2     -C2r6+C6r2
+    paddd            m7, m1             ; A0             a0
+    paddd            m1, m1             ; 2C0            2c0
+    psubd            m1, m7             ; A3             a3
+    paddd            m3, m2             ; A1             a1
+    paddd            m2, m2             ; 2C1            2c1
+    psubd            m2, m3             ; A2             a2
+    psrad            m4, %6
+    psrad            m7, %6
+    psrad            m3, %6
+    packssdw         m4, m7             ; A0     a0
+    pshufd           m4, m4, 0xD8
+    mova           [%5], m4
+    psrad            m0, %6
+    packssdw         m0, m3             ; A1     a1
+    pshufd           m0, m0, 0xD8
+    mova      [16 + %5], m0
+    mova      [96 + %5], m0
+    mova     [112 + %5], m4
+    psrad            m5, %6
+    psrad            m6, %6
+    psrad            m2, %6
+    packssdw         m5, m2             ; A2-B2  a2-b2
+    pshufd           m5, m5, 0xD8
+    mova      [32 + %5], m5
+    psrad            m1, %6
+    packssdw         m6, m1             ; A3+B3  a3+b3
+    pshufd           m6, m6, 0xD8
+    mova      [48 + %5], m6
+    mova      [64 + %5], m6
+    mova      [80 + %5], m5
 %endmacro
 
 %macro IDCT7 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm1, %2             ; R6     R2      r6      r2
-    movq            mm2, %3             ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    paddd           mm0, mm1            ; A1             a1
-    psubd           mm5, mm1            ; A2             a2
-    movq            mm1, [coeffs + 64]
-    pmaddwd         mm1, mm2            ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    psrad           mm7, %6
-    psrad           mm4, %6
-    movq            mm3, mm0            ; A1             a1
-    paddd           mm0, mm1            ; A1+B1          a1+b1
-    psubd           mm3, mm1            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm3, %6
-    packssdw        mm7, mm7            ; A0+B0  a0+b0
-    movd           [%5], mm7
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm3, mm3            ; A1-B1  a1-b1
-    movd      [96 + %5], mm3
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
-    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    movq            mm3, mm5            ; A2             a2
-    paddd           mm3, mm4            ; A2+B2          a2+b2
-    psubd           mm5, mm4            ; a2-B2          a2-b2
-    psrad           mm3, %6
-    psrad           mm5, %6
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm6, mm2            ; A3+B3          a3+b3
-    psubd           mm4, mm2            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    packssdw        mm3, mm3            ; A2+B2  a2+b2
-    movd      [32 + %5], mm3
-    psrad           mm4, %6
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [48 + %5], mm6
-    packssdw        mm4, mm4            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [64 + %5], mm4
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m1, %2             ; R6     R2      r6      r2
+    mova             m2, %3             ; R3     R1      r3      r1
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    mova             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    paddd            m0, m1             ; A1             a1
+    psubd            m5, m1             ; A2             a2
+    mova             m1, [coeffs + 128]
+    pmaddwd          m1, m2             ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    psrad            m7, %6
+    psrad            m4, %6
+    mova             m3, m0             ; A1             a1
+    paddd            m0, m1             ; A1+B1          a1+b1
+    psubd            m3, m1             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m3, %6
+    packssdw         m7, m7             ; A0+B0  a0+b0
+    movq           [%5], m7
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m3, m3             ; A1-B1  a1-b1
+    movq      [96 + %5], m3
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    pmaddwd          m4, m2             ; -C1R3+C5R1     -C1r3+C5r1
+    pmaddwd          m2, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    mova             m3, m5             ; A2             a2
+    paddd            m3, m4             ; A2+B2          a2+b2
+    psubd            m5, m4             ; a2-B2          a2-b2
+    psrad            m3, %6
+    psrad            m5, %6
+    mova             m4, m6             ; A3             a3
+    paddd            m6, m2             ; A3+B3          a3+b3
+    psubd            m4, m2             ; a3-B3          a3-b3
+    psrad            m6, %6
+    packssdw         m3, m3             ; A2+B2  a2+b2
+    movq      [32 + %5], m3
+    psrad            m4, %6
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [48 + %5], m6
+    packssdw         m4, m4             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [64 + %5], m4
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT8 6
-    movq            mm0, [%1]           ; R4     R0      r4      r0
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    psrad           mm4, %6
-    psrad           mm0, %6
-    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
-    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
-    psrad           mm1, %6
-    packssdw        mm4, mm1            ; A0     a0
-    movq           [%5], mm4
-    psrad           mm2, %6
-    packssdw        mm0, mm2            ; A1     a1
-    movq      [16 + %5], mm0
-    movq      [96 + %5], mm0
-    movq     [112 + %5], mm4
-    movq      [32 + %5], mm0
-    movq      [48 + %5], mm4
-    movq      [64 + %5], mm4
-    movq      [80 + %5], mm0
+    movq             m0, [%1]           ; R4     R0      r4      r0
+    movhps           m0, [%1 + 16]
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    psrad            m4, %6
+    psrad            m0, %6
+    movq             m2, [%1 + 8]       ; R4     R0      r4      r0
+    movhps           m2, [%1 + 24]
+    mova             m1, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m1, m2             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m2, m7             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m7, [coeffs + 64]  ; C6     C2      C6      C2
+    psrad            m1, %6
+    packssdw         m4, m1             ; A0     a0
+    pshufd           m4, m4, 0xD8
+    mova           [%5], m4
+    psrad            m2, %6
+    packssdw         m0, m2             ; A1     a1
+    pshufd           m0, m0, 0xD8
+    mova      [16 + %5], m0
+    mova      [96 + %5], m0
+    mova     [112 + %5], m4
+    mova      [32 + %5], m0
+    mova      [48 + %5], m4
+    mova      [64 + %5], m4
+    mova      [80 + %5], m0
 %endmacro
 
 %macro IDCT 0
@@ -710,9 +735,7 @@ SECTION .text
     Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%1
 
     IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
@@ -721,9 +744,7 @@ SECTION .text
     Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5
 
     IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
@@ -731,9 +752,7 @@ SECTION .text
     Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7
 
     IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
@@ -741,41 +760,33 @@ SECTION .text
     Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3
 
     IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
     %%3:
 
     IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
     %%5:
 
     IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
-    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
     jmp %%9
 
     ALIGN 16
     %%1:
 
     IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
     %%7:
 
     IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
-    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
 
     %%9:
 %endmacro
@@ -805,15 +816,12 @@ SECTION .text
     movhps     [pixelsq+lsizeq], m0
 %endmacro
 
-INIT_MMX mmx
+INIT_XMM sse2
 
 cglobal simple_idct, 1, 2, 8, 128, block, t0
     IDCT
-    emms
 RET
 
-INIT_XMM sse2
-
 cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
     IDCT
     lea lsize3q, [lsizeq*3]
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index 9b64cfe9bc..c9ba6aedaf 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -22,10 +22,7 @@
 #include <stddef.h>
 #include <stdint.h>
 
-void ff_simple_idct_mmx(int16_t *block);
-void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
-void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
-
+void ff_simple_idct_sse2(int16_t *block);
 void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
-- 
2.49.1
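
A note on the conversion pattern in the macros above: every MMX
coefficient load such as [coeffs + 16] becomes [coeffs + 32] because
each constant row of the table now occupies 16 bytes instead of 8, and
each IDCTn invocation covers twice as many columns, which is why half
of the invocations are dropped further down. The one non-mechanical
part is the store path: the MMX code emitted two separate
packssdw/movq (or movd) pairs, while the XMM code packs both halves at
once and then needs pshufd with 0xD8 to restore the word order the two
MMX stores used to produce. A standalone intrinsics sketch of that
idiom (illustration only; the function name is hypothetical, not part
of the patch):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Pack two vectors of 32-bit IDCT sums to 16-bit coefficients in
     * the same word order that two MMX packssdw+movq stores produced. */
    static void pack_store_sse2(int16_t *dst, __m128i lo, __m128i hi)
    {
        /* word order after the pack: lo.d0..d3, hi.d0..d3 */
        __m128i packed = _mm_packs_epi32(lo, hi);
        /* pshufd 0xD8 == _MM_SHUFFLE(3, 1, 2, 0): swap the two middle
         * dwords, yielding lo.d0 lo.d1 hi.d0 hi.d1 lo.d2 lo.d3
         * hi.d2 hi.d3 */
        packed = _mm_shuffle_epi32(packed, 0xD8);
        _mm_store_si128((__m128i *)dst, packed);
    }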


>From e7152c27777f5ade271e5c06487776dfa9e6eb94 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 5 Nov 2025 05:08:28 +0100
Subject: [PATCH 7/7] avcodec/dvdec,mjpegdec: Remove emms_c

The emms_c() calls are no longer necessary now that IDCTDSP is always
ABI-compliant (i.e. free of MMX).

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/dvdec.c    | 3 ---
 libavcodec/mjpegdec.c | 2 --
 2 files changed, 5 deletions(-)

diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c
index 242708c70a..4799ec96dc 100644
--- a/libavcodec/dvdec.c
+++ b/libavcodec/dvdec.c
@@ -36,7 +36,6 @@
  */
 
 #include "libavutil/avassert.h"
-#include "libavutil/emms.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem_internal.h"
 #include "libavutil/thread.h"
@@ -683,8 +682,6 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, AVFrame *frame,
     avctx->execute(avctx, dv_decode_video_segment, s->work_chunks, NULL,
                    dv_work_pool_size(s->sys), sizeof(DVwork_chunk));
 
-    emms_c();
-
     /* return image */
     *got_frame = 1;
 
diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c
index 5fd77073da..fb39c4e9fd 100644
--- a/libavcodec/mjpegdec.c
+++ b/libavcodec/mjpegdec.c
@@ -33,7 +33,6 @@
 #include "config_components.h"
 
 #include "libavutil/attributes.h"
-#include "libavutil/emms.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/avassert.h"
 #include "libavutil/mem.h"
@@ -1824,7 +1823,6 @@ next_field:
         }
     }
 
-    emms_c();
     return 0;
  out_of_range:
     av_log(s->avctx, AV_LOG_ERROR, "decode_sos: ac/dc index out of range\n");
-- 
2.49.1
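
For context on the deleted calls: MMX registers alias the x87
floating-point register stack, so after any MMX code runs, x87 float
math misbehaves until an EMMS instruction resets the FPU tag word;
emms_c() is FFmpeg's wrapper around that instruction. With
ff_simple_idct_mmx() gone, no MMX code is reachable from these
decoders, so the calls were dead weight. A minimal sketch of the
hazard they used to guard against (illustration only, not FFmpeg code;
assumes a compiler providing MMX intrinsics):

    #include <mmintrin.h>   /* MMX intrinsics; _mm_empty() emits EMMS */
    #include <stdint.h>

    static double sum_then_scale(const int16_t *a, const int16_t *b)
    {
        /* Any MMX instruction puts the x87 stack into MMX state... */
        __m64 v = _mm_adds_pi16(*(const __m64 *)a, *(const __m64 *)b);
        int16_t out[4];
        *(__m64 *)out = v;
        /* ...so EMMS must run before any x87 float operation; this is
         * what the removed emms_c() calls did. */
        _mm_empty();
        return (out[0] + out[1] + out[2] + out[3]) / 4.0;
    }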

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
