From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: mkver <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH] avcodec/me_cmp: Remove MMXEXT hadamard diff functions (PR #21018)
Date: Tue, 25 Nov 2025 15:54:50 -0000
Message-ID: <176408609136.39.14488614940261261295@2cb04c0e5124> (raw)
PR #21018 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21018
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21018.patch
>From 7b076a270165995d5ad943b27bd620f41e7b9019 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 25 Nov 2025 15:29:19 +0100
Subject: [PATCH 1/3] avcodec/x86/me_cmp: Avoid manual stack handling
Use x86inc's stack alignment feature instead of allocating the stack
manually*; this means that this code now also automatically supports
unaligned stacks, so that the SSE2 and SSSE3 functions will now be
available everywhere.
*: The code for this was also buggy: It resulted in the stack pointer
to be 4 mod 8 for x64 for the mmxext version before it was disabled
in 542765ce3eccbca587d54262a512cbdb1407230d, because it hardcode 4
instead of using gprsize.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/me_cmp.asm | 23 ++++-------------------
libavcodec/x86/me_cmp_init.c | 4 ----
2 files changed, 4 insertions(+), 23 deletions(-)
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 4545eae276..e123089ba3 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -152,23 +152,11 @@ SECTION .text
%endmacro
%macro hadamard8_16_wrapper 2
-cglobal hadamard8_diff, 4, 4, %1
-%ifndef m8
- %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
- SUB rsp, pad
-%endif
+cglobal hadamard8_diff, 4, 4, %1, %2*mmsize
call hadamard8x8_diff %+ SUFFIX
-%ifndef m8
- ADD rsp, pad
-%endif
RET
-cglobal hadamard8_diff16, 5, 6, %1
-%ifndef m8
- %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
- SUB rsp, pad
-%endif
-
+cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
call hadamard8x8_diff %+ SUFFIX
mov r5d, eax
@@ -192,9 +180,6 @@ cglobal hadamard8_diff16, 5, 6, %1
.done:
mov eax, r5d
-%ifndef m8
- ADD rsp, pad
-%endif
RET
%endmacro
@@ -215,7 +200,7 @@ hadamard8x8_diff %+ SUFFIX:
and eax, 0xFFFF
ret
-hadamard8_16_wrapper %1, 3
+hadamard8_16_wrapper %1, 2*ARCH_X86_32
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,
@@ -261,7 +246,7 @@ hadamard8x8_diff %+ SUFFIX:
and rax, 0xFFFF
ret
-hadamard8_16_wrapper 0, 14
+hadamard8_16_wrapper 0, 13
%endif
%endmacro
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index d4503eef3b..35abbbf7f5 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -146,10 +146,8 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->pix_abs[0][2] = ff_sad16_y2_sse2;
c->pix_abs[0][3] = ff_sad16_xy2_sse2;
-#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
-#endif
if (avctx->codec_id != AV_CODEC_ID_SNOW) {
c->sad[0] = ff_sad16_sse2;
@@ -179,10 +177,8 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->nsse[1] = nsse8_ssse3;
c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
-#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
-#endif
}
#endif
}
--
2.49.1
>From ae99f1b6d80fba362c180d0d2dab4e3872970db0 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 25 Nov 2025 15:45:44 +0100
Subject: [PATCH 2/3] avcodec/me_cmp: Remove MMXEXT hadamard diff functions
The SSE2 and SSSE3 functions are now available everywhere,
making the MMXEXT functions irrelevant.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/me_cmp.asm | 88 ++----------------------------------
libavcodec/x86/me_cmp_init.c | 6 ---
2 files changed, 4 insertions(+), 90 deletions(-)
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index e123089ba3..2a196d03bb 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -112,7 +112,6 @@ SECTION .text
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro HSUM 3
-%if cpuflag(sse2)
movhlps %2, %1
paddusw %1, %2
pshuflw %2, %1, 0xE
@@ -120,35 +119,6 @@ SECTION .text
pshuflw %2, %1, 0x1
paddusw %1, %2
movd %3, %1
-%elif cpuflag(mmxext)
- pshufw %2, %1, 0xE
- paddusw %1, %2
- pshufw %2, %1, 0x1
- paddusw %1, %2
- movd %3, %1
-%elif cpuflag(mmx)
- mova %2, %1
- psrlq %1, 32
- paddusw %1, %2
- mova %2, %1
- psrlq %1, 16
- paddusw %1, %2
- movd %3, %1
-%endif
-%endmacro
-
-%macro STORE4 5
- mova [%1+mmsize*0], %2
- mova [%1+mmsize*1], %3
- mova [%1+mmsize*2], %4
- mova [%1+mmsize*3], %5
-%endmacro
-
-%macro LOAD4 5
- mova %2, [%1+mmsize*0]
- mova %3, [%1+mmsize*1]
- mova %4, [%1+mmsize*2]
- mova %5, [%1+mmsize*3]
%endmacro
%macro hadamard8_16_wrapper 2
@@ -183,8 +153,10 @@ cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
RET
%endmacro
-%macro HADAMARD8_DIFF 0-1
-%if cpuflag(sse2)
+%macro HADAMARD8_DIFF 1
+; r1, r2 and r3 are not clobbered in this function, so 16x16 can
+; simply call this 2x2x (and that's why we access rsp+gprsize
+; everywhere, which is rsp of calling function)
hadamard8x8_diff %+ SUFFIX:
lea r0, [r3*3]
DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
@@ -201,60 +173,8 @@ hadamard8x8_diff %+ SUFFIX:
ret
hadamard8_16_wrapper %1, 2*ARCH_X86_32
-%elif cpuflag(mmx)
-ALIGN 16
-; int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,
-; const uint8_t *src2, ptrdiff_t stride, int h)
-; r0 = void *s = unused, int h = unused (always 8)
-; note how r1, r2 and r3 are not clobbered in this function, so 16x16
-; can simply call this 2x2x (and that's why we access rsp+gprsize
-; everywhere, which is rsp of calling func
-hadamard8x8_diff %+ SUFFIX:
- lea r0, [r3*3]
-
- ; first 4x8 pixels
- DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
- HADAMARD8
- mova [rsp+gprsize+0x60], m7
- TRANSPOSE4x4W 0, 1, 2, 3, 7
- STORE4 rsp+gprsize, m0, m1, m2, m3
- mova m7, [rsp+gprsize+0x60]
- TRANSPOSE4x4W 4, 5, 6, 7, 0
- STORE4 rsp+gprsize+0x40, m4, m5, m6, m7
-
- ; second 4x8 pixels
- DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
- HADAMARD8
- mova [rsp+gprsize+0x60], m7
- TRANSPOSE4x4W 0, 1, 2, 3, 7
- STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
- mova m7, [rsp+gprsize+0x60]
- TRANSPOSE4x4W 4, 5, 6, 7, 0
-
- LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
- HADAMARD8
- ABS_SUM_8x8_32 rsp+gprsize+0x60
- mova [rsp+gprsize+0x60], m0
-
- LOAD4 rsp+gprsize , m0, m1, m2, m3
- LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
- HADAMARD8
- ABS_SUM_8x8_32 rsp+gprsize
- paddusw m0, [rsp+gprsize+0x60]
-
- HSUM m0, m1, eax
- and rax, 0xFFFF
- ret
-
-hadamard8_16_wrapper 0, 13
-%endif
%endmacro
-%if HAVE_ALIGNED_STACK == 0
-INIT_MMX mmxext
-HADAMARD8_DIFF
-%endif
-
INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 35abbbf7f5..3a8b46f4e1 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -77,7 +77,6 @@ int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t
int ff_hadamard8_diff16_ ## cpu(MPVEncContext *s, const uint8_t *src1, \
const uint8_t *src2, ptrdiff_t stride, int h);
-hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)
@@ -116,11 +115,6 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags)) {
-#if !HAVE_ALIGNED_STACK
- c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
- c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
-#endif
-
c->sad[1] = ff_sad8_mmxext;
c->pix_abs[1][0] = ff_sad8_mmxext;
--
2.49.1
>From 68ea75b632c5787c7a5566aa6a47ab889b17541e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 25 Nov 2025 16:28:51 +0100
Subject: [PATCH 3/3] avcodec/x86/me_cmp: Avoid call on UNIX64
The internal functions for calculating the hadamard difference
of two 8x8 blocks have no epilogue on UNIX64, so one can avoid
the call altogether by placing the 8x8 function so that it directly
falls into the internal function.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/me_cmp.asm | 15 +++++----------
1 file changed, 5 insertions(+), 10 deletions(-)
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 2a196d03bb..3ac8acee2c 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -121,12 +121,8 @@ SECTION .text
movd %3, %1
%endmacro
-%macro hadamard8_16_wrapper 2
-cglobal hadamard8_diff, 4, 4, %1, %2*mmsize
- call hadamard8x8_diff %+ SUFFIX
- RET
-
-cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
+%macro HADAMARD8_DIFF 1
+cglobal hadamard8_diff16, 5, 6, %1, 2*mmsize*ARCH_X86_32
call hadamard8x8_diff %+ SUFFIX
mov r5d, eax
@@ -151,9 +147,10 @@ cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
.done:
mov eax, r5d
RET
-%endmacro
-%macro HADAMARD8_DIFF 1
+cglobal hadamard8_diff, 4, 4, %1, 2*mmsize*ARCH_X86_32
+ TAIL_CALL hadamard8x8_diff %+ SUFFIX, 0
+
; r1, r2 and r3 are not clobbered in this function, so 16x16 can
; simply call this 2x2x (and that's why we access rsp+gprsize
; everywhere, which is rsp of calling function)
@@ -171,8 +168,6 @@ hadamard8x8_diff %+ SUFFIX:
HSUM m0, m1, eax
and eax, 0xFFFF
ret
-
-hadamard8_16_wrapper %1, 2*ARCH_X86_32
%endmacro
INIT_XMM sse2
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-11-25 15:55 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=176408609136.39.14488614940261261295@2cb04c0e5124 \
--to=ffmpeg-devel@ffmpeg.org \
--cc=code@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git