* [FFmpeg-devel] [PATCH] Avoid using MMX in me_cmp (PR #20705)
@ 2025-10-14 11:21 mkver via ffmpeg-devel
From: mkver via ffmpeg-devel @ 2025-10-14 11:21 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #20705 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20705
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20705.patch
One could probably replace the Hadamard MMXEXT functions with SSE2 ones by simply using an unaligned load/store when an aligned stack is not available.
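A minimal sketch of that idea in x86inc style (illustrative only: SPILL_MOV is a made-up name, while HAVE_ALIGNED_STACK, mova and movu are the usual FFmpeg/x86inc facilities; this is not a tested conversion):

; Pick the stack-spill flavor once, based on stack alignment, so one
; SSE2 hadamard body could serve both aligned and unaligned builds.
%if HAVE_ALIGNED_STACK
    %define SPILL_MOV mova   ; rsp is 16-byte aligned: aligned moves are safe
%else
    %define SPILL_MOV movu   ; unaligned stack: use unaligned moves instead
%endif
    SPILL_MOV [rsp+0x00], m4 ; spill intermediate rows to the stack
    SPILL_MOV [rsp+0x10], m5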
From 7d72001ccf1e35ccac9ea0c632f517c61490a254 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 13 Oct 2025 21:20:40 +0200
Subject: [PATCH 1/3] avcodec/x86/me_cmp: Add SSE2 sad 8,16 xy2 functions
The new functions are faster than the existing exact
functions, yet are still beaten by the non-exact ones
(which can avoid unpacking to words and back).
The exact (slow) MMX functions have therefore been
removed, which was actually beneficial size-wise
(416B of new functions vs. 619B of functions removed).
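For reference, a scalar sketch of the exact half-pel (x+1/2, y+1/2)
SAD that these functions vectorize (illustrative only; the names do
not match the actual C fallbacks):

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Exact variant: bilinear average of the 2x2 neighbourhood with
 * round-to-nearest, (a + b + c + d + 2) >> 2. The approximate variant
 * instead chains byte averages (pavgb), which rounds differently. */
static int sad_xy2_exact(const uint8_t *pix1, const uint8_t *pix2,
                         ptrdiff_t stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int avg = (pix2[x] + pix2[x + 1] +
                       pix2[x + stride] + pix2[x + stride + 1] + 2) >> 2;
            sum += abs(pix1[x] - avg);
        }
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}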
pix_abs_0_3_c: 216.8 ( 1.00x)
pix_abs_0_3_mmx: 71.8 ( 3.02x)
pix_abs_0_3_mmxext (approximative): 17.6 (12.34x)
pix_abs_0_3_sse2: 23.5 ( 9.23x)
pix_abs_0_3_sse2 (approximative): 9.9 (21.94x)
pix_abs_1_3_c: 98.4 ( 1.00x)
pix_abs_1_3_mmx: 36.9 ( 2.66x)
pix_abs_1_3_mmxext (approximative): 9.2 (10.73x)
pix_abs_1_3_sse2: 14.8 ( 6.63x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/me_cmp.asm | 99 ++++++++++++++++++++++++++-
libavcodec/x86/me_cmp_init.c | 128 +++--------------------------------
2 files changed, 106 insertions(+), 121 deletions(-)
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index a494cdeb64..ee83556d14 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -23,10 +23,9 @@
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-
cextern pb_1
cextern pb_80
+cextern pw_2
SECTION .text
@@ -667,6 +666,102 @@ SAD_Y2 16
INIT_XMM sse2
SAD_Y2 16
+;------------------------------------------------------------------------------------------
+;int ff_sad_xy2_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
+;------------------------------------------------------------------------------------------
+
+;%1 = 8/16, %2 = aligned mov, %3 = unaligned mov
+%macro SAD_XY2 3
+cglobal sad%1_xy2, 5, 5, mmsize == 16 ? 8 + ARCH_X86_64 : 7, v, pix1, pix2, stride, h
+ mov%3 m2, [pix2q]
+ mov%3 m3, [pix2q+1]
+%if %1 == mmsize
+%if ARCH_X86_64
+ mova m8, [pw_2]
+ %define PW_2 m8
+%else
+ %define PW_2 [pw_2]
+%endif
+%else ; %1 != mmsize
+ mova m6, [pw_2]
+ %define PW_2 m6
+%endif
+ pxor m1, m1
+ add pix2q, strideq
+%if %1 != mmsize/2
+ mova m6, m2
+ mova m7, m3
+ punpckhbw m6, m1
+ punpckhbw m7, m1
+ paddw m6, m7
+%endif
+ punpcklbw m2, m1
+ punpcklbw m3, m1
+ paddw m2, m3
+ mova m0, m1
+
+.loop:
+ mov%3 m3, [pix2q]
+ mov%3 m4, [pix2q+1]
+%if %1 != mmsize/2
+ mova m5, m3
+ mova m7, m4
+ punpckhbw m5, m1
+ punpckhbw m7, m1
+ paddw m7, m5
+ paddw m7, PW_2
+ paddw m6, m7
+ psraw m6, 2
+%endif
+ mov%2 m5, [pix1q]
+ punpcklbw m3, m1
+ punpcklbw m4, m1
+ paddw m3, m4
+ paddw m3, PW_2
+ paddw m2, m3
+ psraw m2, 2
+ packuswb m2, m6
+ psadbw m2, m5
+ paddw m0, m2
+
+ mov%3 m2, [pix2q+strideq]
+ mov%3 m4, [pix2q+strideq+1]
+%if %1 != mmsize/2
+ mova m5, m2
+ mova m6, m4
+ punpckhbw m5, m1
+ punpckhbw m6, m1
+ paddw m6, m5
+ paddw m7, m6
+ psraw m7, 2
+%endif
+ mov%2 m5, [pix1q+strideq]
+ punpcklbw m2, m1
+ punpcklbw m4, m1
+ paddw m2, m4
+ paddw m3, m2
+ psraw m3, 2
+ packuswb m3, m7
+ psadbw m3, m5
+ paddw m0, m3
+
+ sub hd, 2
+ lea pix1q, [pix1q+2*strideq]
+ lea pix2q, [pix2q+2*strideq]
+ jnz .loop
+
+%if %1 == 16
+ movhlps m1, m0
+ paddw m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD_XY2 8, h, h
+SAD_XY2 16, a, u
+
;-------------------------------------------------------------------------------------------
;int ff_sad_approx_xy2_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;-------------------------------------------------------------------------------------------
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 45425f7109..a3897e2a0b 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -24,8 +24,6 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
-#include "libavutil/mem_internal.h"
-#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideoenc.h"
@@ -60,10 +58,14 @@ int ff_sad16_y2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_sad8_xy2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+ ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_sad16_xy2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+ ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
@@ -121,127 +123,10 @@ static int nsse8_mmx(MPVEncContext *c, const uint8_t *pix1, const uint8_t *pix2,
#endif /* HAVE_X86ASM */
-#if HAVE_INLINE_ASM
-
-DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
- 0x0000000000000000ULL,
- 0x0001000100010001ULL,
- 0x0002000200020002ULL,
-};
-
-static inline void sad8_4_mmx(const uint8_t *blk1, const uint8_t *blk2,
- ptrdiff_t stride, int h)
-{
- x86_reg len = -stride * h;
- __asm__ volatile (
- "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
- "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "paddw %%mm2, %%mm0 \n\t"
- "paddw %%mm3, %%mm1 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%2, %%"FF_REG_a"), %%mm2\n\t"
- "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddw %%mm4, %%mm2 \n\t"
- "paddw %%mm5, %%mm3 \n\t"
- "movq %5, %%mm5 \n\t"
- "paddw %%mm2, %%mm0 \n\t"
- "paddw %%mm3, %%mm1 \n\t"
- "paddw %%mm5, %%mm0 \n\t"
- "paddw %%mm5, %%mm1 \n\t"
- "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
- "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
- "psrlw $2, %%mm0 \n\t"
- "psrlw $2, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "psubusb %%mm0, %%mm4 \n\t"
- "psubusb %%mm5, %%mm0 \n\t"
- "por %%mm4, %%mm0 \n\t"
- "movq %%mm0, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpckhbw %%mm7, %%mm4 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm4, %%mm6 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "movq %%mm3, %%mm1 \n\t"
- "add %4, %%"FF_REG_a" \n\t"
- " js 1b \n\t"
- : "+a" (len)
- : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
- "r" (stride), "m" (round_tab[2]));
-}
-
-static inline int sum_mmx(void)
-{
- int ret;
- __asm__ volatile (
- "movq %%mm6, %%mm0 \n\t"
- "psrlq $32, %%mm6 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "movq %%mm6, %%mm0 \n\t"
- "psrlq $16, %%mm6 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "movd %%mm6, %0 \n\t"
- : "=r" (ret));
- return ret & 0xFFFF;
-}
-
-#define PIX_SADXY(suf) \
-static int sad8_xy2_ ## suf(MPVEncContext *v, const uint8_t *blk2, \
- const uint8_t *blk1, ptrdiff_t stride, int h) \
-{ \
- __asm__ volatile ( \
- "pxor %%mm7, %%mm7 \n\t" \
- "pxor %%mm6, %%mm6 \n\t" \
- ::); \
- \
- sad8_4_ ## suf(blk1, blk2, stride, h); \
- \
- return sum_ ## suf(); \
-} \
- \
-static int sad16_xy2_ ## suf(MPVEncContext *v, const uint8_t *blk2, \
- const uint8_t *blk1, ptrdiff_t stride, int h) \
-{ \
- __asm__ volatile ( \
- "pxor %%mm7, %%mm7 \n\t" \
- "pxor %%mm6, %%mm6 \n\t" \
- ::); \
- \
- sad8_4_ ## suf(blk1, blk2, stride, h); \
- sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
- \
- return sum_ ## suf(); \
-} \
-
-PIX_SADXY(mmx)
-
-#endif /* HAVE_INLINE_ASM */
-
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
-#if HAVE_INLINE_ASM
- if (INLINE_MMX(cpu_flags)) {
- c->pix_abs[0][3] = sad16_xy2_mmx;
- c->pix_abs[1][3] = sad8_xy2_mmx;
- }
-
-#endif /* HAVE_INLINE_ASM */
-
if (EXTERNAL_MMX(cpu_flags)) {
c->sse[1] = ff_sse8_mmx;
#if HAVE_X86ASM
@@ -282,6 +167,8 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->sse[0] = ff_sse16_sse2;
c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
+ c->pix_abs[0][3] = ff_sad16_xy2_sse2;
+
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
@@ -298,6 +185,9 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->vsad[0] = ff_vsad16_approx_sse2;
}
}
+ if (avctx->flags & AV_CODEC_FLAG_BITEXACT) {
+ c->pix_abs[1][3] = ff_sad8_xy2_sse2;
+ }
}
if (EXTERNAL_SSSE3(cpu_flags)) {
--
2.49.1
From c14ec47b6673d518492d28fc3d0d7e472ff5930b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 13 Oct 2025 22:36:49 +0200
Subject: [PATCH 2/3] avcodec/x86/me_cmp: Remove MMXEXT functions overridden by
SSE2
The SSE2 functions overriding them are currently only set
if the SSE2SLOW flag is not set and the codec is not Snow.
The former affects only outdated processors (AMDs from
before Barcelona, i.e. before 2007) and is therefore irrelevant.
Snow does not use the pix_abs function pointers at all,
so the latter is no obstacle either.
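For context, a condensed sketch of the pre-patch dispatch being
discussed (it mirrors the init code changed below, trimmed for
illustration):

if (EXTERNAL_SSE2(cpu_flags)) {
    if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) &&
        avctx->codec_id != AV_CODEC_ID_SNOW) {
        /* On anything newer than pre-2007 AMDs these overrides always
         * win, so the MMXEXT size-16 versions are effectively dead. */
        c->pix_abs[0][0] = ff_sad16_sse2;
        c->pix_abs[0][1] = ff_sad16_x2_sse2;
        c->pix_abs[0][2] = ff_sad16_y2_sse2;
    }
}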
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/me_cmp.asm | 76 ------------------------------------
libavcodec/x86/me_cmp_init.c | 19 +++------
2 files changed, 5 insertions(+), 90 deletions(-)
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index ee83556d14..8f2d0a6ddf 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -539,16 +539,6 @@ cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
psadbw m0, [pix1q]
psadbw m2, [pix1q+strideq]
paddw m0, m2
-%if %1 != mmsize
- movu m1, [pix2q+8]
- movu m2, [pix2q+strideq+8]
- pavgb m1, [pix2q+9]
- pavgb m2, [pix2q+strideq+9]
- psadbw m1, [pix1q+8]
- psadbw m2, [pix1q+strideq+8]
- paddw m0, m1
- paddw m0, m2
-%endif
sub hd, 2
align 16
@@ -570,16 +560,6 @@ align 16
psadbw m2, [pix1q+strideq]
paddw m0, m1
paddw m0, m2
-%if %1 != mmsize
- movu m1, [pix2q+8]
- movu m2, [pix2q+strideq+8]
- pavgb m1, [pix2q+9]
- pavgb m2, [pix2q+strideq+9]
- psadbw m1, [pix1q+8]
- psadbw m2, [pix1q+strideq+8]
- paddw m0, m1
- paddw m0, m2
-%endif
sub hd, 2
jg .loop
%if mmsize == 16
@@ -592,7 +572,6 @@ align 16
INIT_MMX mmxext
SAD_X2 8
-SAD_X2 16
INIT_XMM sse2
SAD_X2 16
@@ -611,18 +590,6 @@ cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
psadbw m0, [pix1q+strideq]
paddw m0, m1
mova m1, m3
-%if %1 != mmsize
- movu m4, [pix2q+8]
- movu m5, [pix2q+strideq+8]
- movu m6, [pix2q+2*strideq+8]
- pavgb m4, m5
- pavgb m5, m6
- psadbw m4, [pix1q+8]
- psadbw m5, [pix1q+strideq+8]
- paddw m0, m4
- paddw m0, m5
- mova m4, m6
-%endif
add pix2q, strideq
sub hd, 2
@@ -639,17 +606,6 @@ align 16
paddw m0, m1
paddw m0, m2
mova m1, m3
-%if %1 != mmsize
- movu m5, [pix2q+8]
- movu m6, [pix2q+strideq+8]
- pavgb m4, m5
- pavgb m5, m6
- psadbw m4, [pix1q+8]
- psadbw m5, [pix1q+strideq+8]
- paddw m0, m4
- paddw m0, m5
- mova m4, m6
-%endif
sub hd, 2
jg .loop
%if mmsize == 16
@@ -662,7 +618,6 @@ align 16
INIT_MMX mmxext
SAD_Y2 8
-SAD_Y2 16
INIT_XMM sse2
SAD_Y2 16
@@ -791,22 +746,6 @@ cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
psadbw m0, [pix1q+strideq]
paddw m0, m1
mova m1, m3
-%if %1 != mmsize
- movu m5, [pix2q+8]
- movu m6, [pix2q+strideq+8]
- movu m7, [pix2q+2*strideq+8]
- pavgb m5, [pix2q+1+8]
- pavgb m6, [pix2q+strideq+1+8]
- pavgb m7, [pix2q+2*strideq+1+8]
- psubusb m6, m4
- pavgb m5, m6
- pavgb m6, m7
- psadbw m5, [pix1q+8]
- psadbw m6, [pix1q+strideq+8]
- paddw m0, m5
- paddw m0, m6
- mova m5, m7
-%endif
add pix2q, strideq
sub hd, 2
@@ -833,20 +772,6 @@ align 16
paddw m0, m1
paddw m0, m2
mova m1, m3
-%if %1 != mmsize
- movu m6, [pix2q+8]
- movu m7, [pix2q+strideq+8]
- pavgb m6, [pix2q+8+1]
- pavgb m7, [pix2q+strideq+8+1]
- psubusb m6, m4
- pavgb m5, m6
- pavgb m6, m7
- psadbw m5, [pix1q+8]
- psadbw m6, [pix1q+strideq+8]
- paddw m0, m5
- paddw m0, m6
- mova m5, m7
-%endif
sub hd, 2
jg .loop
%if mmsize == 16
@@ -859,7 +784,6 @@ align 16
INIT_MMX mmxext
SAD_APPROX_XY2 8
-SAD_APPROX_XY2 16
INIT_XMM sse2
SAD_APPROX_XY2 16
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index a3897e2a0b..658e5bbdc5 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -46,22 +46,16 @@ int ff_sad16_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
-int ff_sad16_x2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
- ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
-int ff_sad16_y2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
- ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_xy2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
-int ff_sad16_approx_xy2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
- ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_xy2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
@@ -144,9 +138,6 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = ff_sad16_mmxext;
c->sad[1] = ff_sad8_mmxext;
- c->pix_abs[0][0] = ff_sad16_mmxext;
- c->pix_abs[0][1] = ff_sad16_x2_mmxext;
- c->pix_abs[0][2] = ff_sad16_y2_mmxext;
c->pix_abs[1][0] = ff_sad8_mmxext;
c->pix_abs[1][1] = ff_sad8_x2_mmxext;
c->pix_abs[1][2] = ff_sad8_y2_mmxext;
@@ -155,7 +146,6 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->vsad[5] = ff_vsad_intra8_mmxext;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
- c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
c->vsad[0] = ff_vsad16_approx_mmxext;
@@ -167,6 +157,9 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->sse[0] = ff_sse16_sse2;
c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
+ c->pix_abs[0][0] = ff_sad16_sse2;
+ c->pix_abs[0][1] = ff_sad16_x2_sse2;
+ c->pix_abs[0][2] = ff_sad16_y2_sse2;
c->pix_abs[0][3] = ff_sad16_xy2_sse2;
#if HAVE_ALIGNED_STACK
@@ -175,18 +168,16 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
#endif
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
c->sad[0] = ff_sad16_sse2;
- c->pix_abs[0][0] = ff_sad16_sse2;
- c->pix_abs[0][1] = ff_sad16_x2_sse2;
- c->pix_abs[0][2] = ff_sad16_y2_sse2;
c->vsad[4] = ff_vsad_intra16_sse2;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
- c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
c->vsad[0] = ff_vsad16_approx_sse2;
}
}
if (avctx->flags & AV_CODEC_FLAG_BITEXACT) {
c->pix_abs[1][3] = ff_sad8_xy2_sse2;
+ } else {
+ c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
}
}
--
2.49.1
From 23ae5b723dfcb4c459bbfb3009f5cbeaaa17d366 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 14 Oct 2025 10:33:18 +0200
Subject: [PATCH 3/3] avcodec/x86/me_cmp: Replace MMXEXT size 16 funcs by
unaligned SSE2 funcs
Snow calls some of the me_cmp functions with insufficient alignment
for the first pointer (see get_block_rd() in snowenc.c);
therefore the SSE2 functions that really need this alignment
are not set for Snow, and 542765ce3eccbca587d54262a512cbdb1407230d
consequently did not remove the MMXEXT functions that are overridden
by these SSE2 functions for normal codecs.
For reference, here is a command line which would segfault
if one simply used the ordinary SSE2 functions for Snow:
./ffmpeg -i mm-short.mpg -an -vcodec snow -t 0.2 -pix_fmt yuv444p \
-vstrict -2 -qscale 2 -flags +qpel -motion_est iter 444iter.avi
This commit adds unaligned SSE2 versions of these functions
and removes the MMXEXT ones. In particular, this implies that
the 16x16 SAD now never uses MMX, which allows removing an
emms_c() call from ac3enc.c.
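Background on that emms_c: MMX registers alias the x87 FPU stack, so
any code path that may have executed MMX instructions must issue EMMS
before floating-point use. A sketch of the now-unnecessary pattern
(process_exponents is a made-up stand-in; the header and helper are
the real libavutil ones):

#include "libavutil/emms.h"

static void process_exponents(void)
{
    /* ... DSP work that could formerly execute MMX instructions ... */
    emms_c(); /* expands to EMMS only on builds where MMX may be in use */
    /* ... floating-point code is safe from here on ... */
}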
Benchmarks (u means unaligned version):
sad_0_c: 8.2 ( 1.00x)
sad_0_mmxext: 10.8 ( 0.76x)
sad_0_sse2: 6.2 ( 1.33x)
sad_0_sse2u: 6.7 ( 1.23x)
vsad_0_c: 44.7 ( 1.00x)
vsad_0_mmxext (approx): 12.2 ( 3.68x)
vsad_0_sse2 (approx): 7.8 ( 5.75x)
vsad_4_c: 88.4 ( 1.00x)
vsad_4_mmxext: 7.1 (12.46x)
vsad_4_sse2: 4.2 (21.15x)
vsad_4_sse2u: 5.5 (15.96x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/ac3enc.c | 3 -
libavcodec/x86/me_cmp.asm | 153 ++++++++++++-----------------------
libavcodec/x86/me_cmp_init.c | 26 +++---
3 files changed, 66 insertions(+), 116 deletions(-)
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index a316d4e4d7..5a1a3ab63a 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -32,7 +32,6 @@
#include "libavutil/avassert.h"
#include "libavutil/channel_layout.h"
#include "libavutil/crc.h"
-#include "libavutil/emms.h"
#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
@@ -945,8 +944,6 @@ static void ac3_process_exponents(AC3EncodeContext *s)
compute_exp_strategy(s);
encode_exponents(s);
-
- emms_c();
}
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 8f2d0a6ddf..7825c8ef71 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -467,22 +467,25 @@ HF_NOISE 16
;---------------------------------------------------------------------------------------
;int ff_sad_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;---------------------------------------------------------------------------------------
-;%1 = 8/16
-%macro SAD 1
-cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
+;%1 = 8/16, %2 = a/u (whether pix1 is aligned or not)
+%macro SAD 1-2
+%ifidn %2, u
+cglobal sad%1u, 5, 5, 5, v, pix1, pix2, stride, h
+%else
+cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
+%endif
movu m2, [pix2q]
movu m1, [pix2q+strideq]
+%ifidn %2, u
+ movu m0, [pix1q]
+ movu m3, [pix1q+strideq]
+ psadbw m2, m0
+ psadbw m1, m3
+%else
psadbw m2, [pix1q]
psadbw m1, [pix1q+strideq]
- paddw m2, m1
-%if %1 != mmsize
- movu m0, [pix2q+8]
- movu m1, [pix2q+strideq+8]
- psadbw m0, [pix1q+8]
- psadbw m1, [pix1q+strideq+8]
- paddw m2, m0
- paddw m2, m1
%endif
+ paddw m2, m1
sub hd, 2
align 16
@@ -491,18 +494,17 @@ align 16
lea pix2q, [pix2q+strideq*2]
movu m0, [pix2q]
movu m1, [pix2q+strideq]
+%ifidn %2, u
+ movu m3, [pix1q]
+ movu m4, [pix1q+strideq]
+ psadbw m0, m3
+ psadbw m1, m4
+%else
psadbw m0, [pix1q]
psadbw m1, [pix1q+strideq]
- paddw m2, m0
- paddw m2, m1
-%if %1 != mmsize
- movu m0, [pix2q+8]
- movu m1, [pix2q+strideq+8]
- psadbw m0, [pix1q+8]
- psadbw m1, [pix1q+strideq+8]
- paddw m2, m0
- paddw m2, m1
%endif
+ paddw m2, m0
+ paddw m2, m1
sub hd, 2
jg .loop
%if mmsize == 16
@@ -515,9 +517,9 @@ align 16
INIT_MMX mmxext
SAD 8
-SAD 16
INIT_XMM sse2
SAD 16
+SAD 16, u
;------------------------------------------------------------------------------------------
;int ff_sad_x2_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
@@ -791,46 +793,26 @@ SAD_APPROX_XY2 16
;int ff_vsad_intra(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
; ptrdiff_t line_size, int h);
;--------------------------------------------------------------------
-; %1 = 8/16
-%macro VSAD_INTRA 1
-cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
- mova m0, [pix1q]
-%if %1 == mmsize
- mova m2, [pix1q+lsizeq]
- psadbw m0, m2
+; %1 = 8/16, %2 = a/u (whether pix1 is aligned or not)
+%macro VSAD_INTRA 2
+%ifidn %2, u
+cglobal vsad_intra%1u, 5, 5, 3, v, pix1, pix2, lsize, h
%else
- mova m2, [pix1q+lsizeq]
- mova m3, [pix1q+8]
- mova m4, [pix1q+lsizeq+8]
- psadbw m0, m2
- psadbw m3, m4
- paddw m0, m3
+cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
%endif
+ mov%2 m0, [pix1q]
+ mov%2 m2, [pix1q+lsizeq]
+ psadbw m0, m2
sub hd, 2
.loop:
lea pix1q, [pix1q + 2*lsizeq]
-%if %1 == mmsize
- mova m1, [pix1q]
+ mov%2 m1, [pix1q]
psadbw m2, m1
paddw m0, m2
- mova m2, [pix1q+lsizeq]
+ mov%2 m2, [pix1q+lsizeq]
psadbw m1, m2
paddw m0, m1
-%else
- mova m1, [pix1q]
- mova m3, [pix1q+8]
- psadbw m2, m1
- psadbw m4, m3
- paddw m0, m2
- paddw m0, m4
- mova m2, [pix1q+lsizeq]
- mova m4, [pix1q+lsizeq+8]
- psadbw m1, m2
- psadbw m3, m4
- paddw m0, m1
- paddw m0, m3
-%endif
sub hd, 2
jg .loop
@@ -843,22 +825,25 @@ cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
%endmacro
INIT_MMX mmxext
-VSAD_INTRA 8
-VSAD_INTRA 16
+VSAD_INTRA 8, a
INIT_XMM sse2
-VSAD_INTRA 16
+VSAD_INTRA 16, a
+VSAD_INTRA 16, u
;---------------------------------------------------------------------
;int ff_vsad_approx(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
; ptrdiff_t line_size, int h);
;---------------------------------------------------------------------
-; %1 = 8/16
-%macro VSAD_APPROX 1
-cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
+; %1 = 8/16, %2 = a/u (whether pix1 is aligned or not)
+%macro VSAD_APPROX 2
+%ifidn %2, u
+cglobal vsad%1u_approx, 5, 5, 5, v, pix1, pix2, lsize, h
+%else
+cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
+%endif
mova m1, [pb_80]
- mova m0, [pix1q]
-%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
- mova m4, [pix1q+lsizeq]
+ mov%2 m0, [pix1q]
+ mov%2 m4, [pix1q+lsizeq]
%if mmsize == 16
movu m3, [pix2q]
movu m2, [pix2q+lsizeq]
@@ -871,29 +856,12 @@ cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
pxor m0, m1
pxor m4, m1
psadbw m0, m4
-%else ; vsad16_mmxext
- mova m3, [pix1q+8]
- psubb m0, [pix2q]
- psubb m3, [pix2q+8]
- pxor m0, m1
- pxor m3, m1
- mova m4, [pix1q+lsizeq]
- mova m5, [pix1q+lsizeq+8]
- psubb m4, [pix2q+lsizeq]
- psubb m5, [pix2q+lsizeq+8]
- pxor m4, m1
- pxor m5, m1
- psadbw m0, m4
- psadbw m3, m5
- paddw m0, m3
-%endif
sub hd, 2
.loop:
lea pix1q, [pix1q + 2*lsizeq]
lea pix2q, [pix2q + 2*lsizeq]
- mova m2, [pix1q]
-%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+ mov%2 m2, [pix1q]
%if mmsize == 16
movu m3, [pix2q]
psubb m2, m3
@@ -903,33 +871,12 @@ cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
pxor m2, m1
psadbw m4, m2
paddw m0, m4
- mova m4, [pix1q+lsizeq]
+ mov%2 m4, [pix1q+lsizeq]
movu m3, [pix2q+lsizeq]
psubb m4, m3
pxor m4, m1
psadbw m2, m4
paddw m0, m2
-%else ; vsad16_mmxext
- mova m3, [pix1q+8]
- psubb m2, [pix2q]
- psubb m3, [pix2q+8]
- pxor m2, m1
- pxor m3, m1
- psadbw m4, m2
- psadbw m5, m3
- paddw m0, m4
- paddw m0, m5
- mova m4, [pix1q+lsizeq]
- mova m5, [pix1q+lsizeq+8]
- psubb m4, [pix2q+lsizeq]
- psubb m5, [pix2q+lsizeq+8]
- pxor m4, m1
- pxor m5, m1
- psadbw m2, m4
- psadbw m3, m5
- paddw m0, m2
- paddw m0, m3
-%endif
sub hd, 2
jg .loop
@@ -942,7 +889,7 @@ cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
%endmacro
INIT_MMX mmxext
-VSAD_APPROX 8
-VSAD_APPROX 16
+VSAD_APPROX 8, a
INIT_XMM sse2
-VSAD_APPROX 16
+VSAD_APPROX 16, a
+VSAD_APPROX 16, u
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 658e5bbdc5..9b23cbe4dc 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -40,10 +40,10 @@ int ff_hf_noise8_mmx(const uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(const uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
-int ff_sad16_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
- ptrdiff_t stride, int h);
int ff_sad16_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_sad16u_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+ ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
@@ -62,16 +62,16 @@ int ff_sad16_xy2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2
ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
-int ff_vsad_intra16_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
- ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_vsad_intra16u_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+ ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
-int ff_vsad16_approx_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
- ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+ ptrdiff_t stride, int h);
#define hadamard_func(cpu) \
int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1, \
@@ -135,20 +135,17 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
#endif
- c->sad[0] = ff_sad16_mmxext;
c->sad[1] = ff_sad8_mmxext;
c->pix_abs[1][0] = ff_sad8_mmxext;
c->pix_abs[1][1] = ff_sad8_x2_mmxext;
c->pix_abs[1][2] = ff_sad8_y2_mmxext;
- c->vsad[4] = ff_vsad_intra16_mmxext;
c->vsad[5] = ff_vsad_intra8_mmxext;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
- c->vsad[0] = ff_vsad16_approx_mmxext;
c->vsad[1] = ff_vsad8_approx_mmxext;
}
}
@@ -166,13 +163,22 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
- if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
+ if (avctx->codec_id != AV_CODEC_ID_SNOW) {
c->sad[0] = ff_sad16_sse2;
c->vsad[4] = ff_vsad_intra16_sse2;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->vsad[0] = ff_vsad16_approx_sse2;
}
+ } else {
+ // Snow does not abide by the alignment requirements
+ // of blk1, so we use special versions without them for it.
+ c->sad[0] = ff_sad16u_sse2;
+
+ c->vsad[4] = ff_vsad_intra16u_sse2;
+ if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
+ c->vsad[0] = ff_vsad16u_approx_sse2;
+ }
}
if (avctx->flags & AV_CODEC_FLAG_BITEXACT) {
c->pix_abs[1][3] = ff_sad8_xy2_sse2;
--
2.49.1