From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH 1/7] postproc/postprocess: Remove obsolete MMX(EXT)/3Dnow functions
Date: Tue, 8 Nov 2022 16:41:52 +0100
Message-ID: <AS8P250MB0744CF39802FD5D4A137A4A88F3F9@AS8P250MB0744.EURP250.PROD.OUTLOOK.COM> (raw)
In-Reply-To: <GV1P250MB073755CC994F87E01FD0373A8F3D9@GV1P250MB0737.EURP250.PROD.OUTLOOK.COM>
Andreas Rheinhardt:
> postprocess.c currently has C, MMX, MMXEXT, 3DNow as well as
> SSE2 versions of its internal functions. But given that only
> ancient 32-bit x86 CPUs don't support SSE2, the MMX, MMXEXT
> and 3DNow versions are obsolete and are therefore removed by
> this commit. This saves about 56KB here.
>
> (The SSE2 version in particular is not really complete,
> so that it often falls back to MMXEXT (which means that
> there were some identical (apart from the name) MMXEXT
> and SSE2 functions); this duplication no longer exists
> with this commit.)
>
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
> ---
> The PP_CPU_CAPS_MMX(2)/3DNOW could now be deprecated.
>
> libpostproc/postprocess.c | 69 ++--
> libpostproc/postprocess_template.c | 521 ++---------------------------
> 2 files changed, 57 insertions(+), 533 deletions(-)
>
> diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c
> index 383c691cb4..0586e458b4 100644
> --- a/libpostproc/postprocess.c
> +++ b/libpostproc/postprocess.c
> @@ -26,28 +26,27 @@
> */
>
> /*
> - C MMX MMX2 3DNow AltiVec
> -isVertDC Ec Ec Ec
> -isVertMinMaxOk Ec Ec Ec
> -doVertLowPass E e e Ec
> -doVertDefFilter Ec Ec e e Ec
> -isHorizDC Ec Ec Ec
> -isHorizMinMaxOk a E Ec
> -doHorizLowPass E e e Ec
> -doHorizDefFilter Ec Ec e e Ec
> -do_a_deblock Ec E Ec E
> -deRing E e e* Ecp
> -Vertical RKAlgo1 E a a
> -Horizontal RKAlgo1 a a
> -Vertical X1# a E E
> -Horizontal X1# a E E
> -LinIpolDeinterlace e E E*
> -CubicIpolDeinterlace a e e*
> -LinBlendDeinterlace e E E*
> + C MMX MMX2 AltiVec
> +isVertDC Ec Ec Ec
> +isVertMinMaxOk Ec Ec Ec
> +doVertLowPass E e Ec
> +doVertDefFilter Ec Ec e Ec
> +isHorizDC Ec Ec Ec
> +isHorizMinMaxOk a E Ec
> +doHorizLowPass E e Ec
> +doHorizDefFilter Ec Ec e Ec
> +do_a_deblock Ec E Ec
> +deRing E e Ecp
> +Vertical RKAlgo1 E a
> +Horizontal RKAlgo1 a
> +Vertical X1# a E
> +Horizontal X1# a E
> +LinIpolDeinterlace e E
> +CubicIpolDeinterlace a e
> +LinBlendDeinterlace e E
> MedianDeinterlace# E Ec Ec
> -TempDeNoiser# E e e Ec
> +TempDeNoiser# E e Ec
>
> -* I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
> # more or less selfinvented filters so the exactness is not too meaningful
> E = Exact implementation
> e = almost exact implementation (slightly different rounding,...)
> @@ -83,7 +82,6 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
> #include <stdlib.h>
> #include <string.h>
> //#undef HAVE_MMXEXT_INLINE
> -//#define HAVE_AMD3DNOW_INLINE
> //#undef HAVE_MMX_INLINE
> //#undef ARCH_X86
> //#define DEBUG_BRIGHTNESS
> @@ -494,7 +492,7 @@ static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
> }
> }
>
> -//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
> +//Note: we have C and SSE2 versions (the latter uses MMX(EXT) when advantageous)
> //Plain C versions
> //we always compile C for testing which needs bitexactness
> #define TEMPLATE_PP_C 1
> @@ -508,27 +506,12 @@ static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
>
> #if ARCH_X86 && HAVE_INLINE_ASM
> # if CONFIG_RUNTIME_CPUDETECT
> -# define TEMPLATE_PP_MMX 1
> -# include "postprocess_template.c"
> -# define TEMPLATE_PP_MMXEXT 1
> -# include "postprocess_template.c"
> -# define TEMPLATE_PP_3DNOW 1
> -# include "postprocess_template.c"
> # define TEMPLATE_PP_SSE2 1
> # include "postprocess_template.c"
> # else
> # if HAVE_SSE2_INLINE
> # define TEMPLATE_PP_SSE2 1
> # include "postprocess_template.c"
> -# elif HAVE_MMXEXT_INLINE
> -# define TEMPLATE_PP_MMXEXT 1
> -# include "postprocess_template.c"
> -# elif HAVE_AMD3DNOW_INLINE
> -# define TEMPLATE_PP_3DNOW 1
> -# include "postprocess_template.c"
> -# elif HAVE_MMX_INLINE
> -# define TEMPLATE_PP_MMX 1
> -# include "postprocess_template.c"
> # endif
> # endif
> #endif
> @@ -549,21 +532,12 @@ static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[]
> #if ARCH_X86 && HAVE_INLINE_ASM
> // ordered per speed fastest first
> if (c->cpuCaps & AV_CPU_FLAG_SSE2) pp = postProcess_SSE2;
> - else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT) pp = postProcess_MMX2;
> - else if (c->cpuCaps & AV_CPU_FLAG_3DNOW) pp = postProcess_3DNow;
> - else if (c->cpuCaps & AV_CPU_FLAG_MMX) pp = postProcess_MMX;
> #elif HAVE_ALTIVEC
> if (c->cpuCaps & AV_CPU_FLAG_ALTIVEC) pp = postProcess_altivec;
> #endif
> #else /* CONFIG_RUNTIME_CPUDETECT */
> #if HAVE_SSE2_INLINE
> pp = postProcess_SSE2;
> -#elif HAVE_MMXEXT_INLINE
> - pp = postProcess_MMX2;
> -#elif HAVE_AMD3DNOW_INLINE
> - pp = postProcess_3DNow;
> -#elif HAVE_MMX_INLINE
> - pp = postProcess_MMX;
> #elif HAVE_ALTIVEC
> pp = postProcess_altivec;
> #endif
> @@ -877,9 +851,6 @@ av_cold pp_context *pp_get_context(int width, int height, int cpuCaps){
> c->cpuCaps = av_get_cpu_flags();
> } else {
> c->cpuCaps = 0;
> - if (cpuCaps & PP_CPU_CAPS_MMX) c->cpuCaps |= AV_CPU_FLAG_MMX;
> - if (cpuCaps & PP_CPU_CAPS_MMX2) c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
> - if (cpuCaps & PP_CPU_CAPS_3DNOW) c->cpuCaps |= AV_CPU_FLAG_3DNOW;
> if (cpuCaps & PP_CPU_CAPS_ALTIVEC) c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
> }
>
> diff --git a/libpostproc/postprocess_template.c b/libpostproc/postprocess_template.c
> index 9f76b7c587..bcf7bdad66 100644
> --- a/libpostproc/postprocess_template.c
> +++ b/libpostproc/postprocess_template.c
> @@ -20,7 +20,7 @@
>
> /**
> * @file
> - * mmx/mmx2/3dnow postprocess code.
> + * mmx/mmx2/sse2 postprocess code.
> */
>
> #include "libavutil/mem_internal.h"
> @@ -57,14 +57,6 @@
> # define TEMPLATE_PP_MMXEXT 0
> #endif
>
> -#ifdef TEMPLATE_PP_3DNOW
> -# undef TEMPLATE_PP_MMX
> -# define TEMPLATE_PP_MMX 1
> -# define RENAME(a) a ## _3DNow
> -#else
> -# define TEMPLATE_PP_3DNOW 0
> -#endif
> -
> #ifdef TEMPLATE_PP_SSE2
> # undef TEMPLATE_PP_MMX
> # define TEMPLATE_PP_MMX 1
> @@ -82,30 +74,19 @@
>
> #if TEMPLATE_PP_MMXEXT
> #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
> -#elif TEMPLATE_PP_3DNOW
> -#define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
> #endif
> #define PAVGB(a,b) REAL_PAVGB(a,b)
>
> #if TEMPLATE_PP_MMXEXT
> #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
> -#elif TEMPLATE_PP_MMX
> -#define PMINUB(b,a,t) \
> - "movq " #a ", " #t " \n\t"\
> - "psubusb " #b ", " #t " \n\t"\
> - "psubb " #t ", " #a " \n\t"
> #endif
>
> #if TEMPLATE_PP_MMXEXT
> #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
> -#elif TEMPLATE_PP_MMX
> -#define PMAXUB(a,b) \
> - "psubusb " #a ", " #b " \n\t"\
> - "paddb " #a ", " #b " \n\t"
> #endif
>
> //FIXME? |255-0| = 1 (should not be a problem ...)
> -#if TEMPLATE_PP_MMX
> +#if TEMPLATE_PP_MMXEXT
> /**
> * Check if the middle 8x8 Block in the given 8x16 block is flat
> */
> @@ -185,20 +166,8 @@ static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContex
> "psubusb %%mm3, %%mm4 \n\t"
>
> " \n\t"
> -#if TEMPLATE_PP_MMXEXT
> "pxor %%mm7, %%mm7 \n\t"
> "psadbw %%mm7, %%mm0 \n\t"
> -#else
> - "movq %%mm0, %%mm1 \n\t"
> - "psrlw $8, %%mm0 \n\t"
> - "paddb %%mm1, %%mm0 \n\t"
> - "movq %%mm0, %%mm1 \n\t"
> - "psrlq $16, %%mm0 \n\t"
> - "paddb %%mm1, %%mm0 \n\t"
> - "movq %%mm0, %%mm1 \n\t"
> - "psrlq $32, %%mm0 \n\t"
> - "paddb %%mm1, %%mm0 \n\t"
> -#endif
> "movq %4, %%mm7 \n\t" // QP,..., QP
> "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
> "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
> @@ -219,7 +188,7 @@ static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContex
> return 2;
> }
> }
> -#endif //TEMPLATE_PP_MMX
> +#endif //TEMPLATE_PP_MMXEXT
>
> /**
> * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
> @@ -228,7 +197,7 @@ static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContex
> #if !TEMPLATE_PP_ALTIVEC
> static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
> {
> -#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#if TEMPLATE_PP_MMXEXT
> src+= stride*3;
> __asm__ volatile( //"movv %0 %1 %2\n\t"
> "movq %2, %%mm0 \n\t" // QP,..., QP
> @@ -355,7 +324,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
> : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
> : "%"FF_REG_a, "%"FF_REG_c
> );
> -#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#else //TEMPLATE_PP_MMXEXT
> const int l1= stride;
> const int l2= stride + l1;
> const int l3= stride + l2;
> @@ -394,7 +363,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
>
> src++;
> }
> -#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#endif //TEMPLATE_PP_MMXEXT
> }
> #endif //TEMPLATE_PP_ALTIVEC
>
> @@ -407,7 +376,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
> */
> static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
> {
> -#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#if TEMPLATE_PP_MMXEXT
> src+= stride*3;
>
> __asm__ volatile(
> @@ -494,7 +463,7 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
> NAMED_CONSTRAINTS_ADD(b01)
> : "%"FF_REG_a, "%"FF_REG_c
> );
> -#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#else //TEMPLATE_PP_MMXEXT
>
> const int l1= stride;
> const int l2= stride + l1;
> @@ -528,13 +497,13 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
> }
> src++;
> }
> -#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#endif //TEMPLATE_PP_MMXEXT
> }
>
> #if !TEMPLATE_PP_ALTIVEC
> static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
> {
> -#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#if TEMPLATE_PP_MMXEXT
> /*
> uint8_t tmp[16];
> const int l1= stride;
> @@ -816,239 +785,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
> }
> }
> */
> -#elif TEMPLATE_PP_MMX
> - DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
> - src+= stride*4;
> - __asm__ volatile(
> - "pxor %%mm7, %%mm7 \n\t"
> -// 0 1 2 3 4 5 6 7
> -// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
> -// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
> -
> - "movq (%0), %%mm0 \n\t"
> - "movq %%mm0, %%mm1 \n\t"
> - "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
> - "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
> -
> - "movq (%0, %1), %%mm2 \n\t"
> - "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
> - "movq %%mm2, %%mm3 \n\t"
> - "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
> - "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
> -
> - "movq (%%"FF_REG_a"), %%mm4 \n\t"
> - "movq %%mm4, %%mm5 \n\t"
> - "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
> - "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
> -
> - "paddw %%mm0, %%mm0 \n\t" // 2L0
> - "paddw %%mm1, %%mm1 \n\t" // 2H0
> - "psubw %%mm4, %%mm2 \n\t" // L1 - L2
> - "psubw %%mm5, %%mm3 \n\t" // H1 - H2
> - "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
> - "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
> -
> - "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
> - "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
> - "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
> - "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
> -
> - "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
> - "movq %%mm2, %%mm3 \n\t"
> - "punpcklbw %%mm7, %%mm2 \n\t" // L3
> - "punpckhbw %%mm7, %%mm3 \n\t" // H3
> -
> - "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
> - "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
> - "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
> - "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
> - "movq %%mm0, (%3) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
> - "movq %%mm1, 8(%3) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
> -
> - "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
> - "movq %%mm0, %%mm1 \n\t"
> - "punpcklbw %%mm7, %%mm0 \n\t" // L4
> - "punpckhbw %%mm7, %%mm1 \n\t" // H4
> -
> - "psubw %%mm0, %%mm2 \n\t" // L3 - L4
> - "psubw %%mm1, %%mm3 \n\t" // H3 - H4
> - "movq %%mm2, 16(%3) \n\t" // L3 - L4
> - "movq %%mm3, 24(%3) \n\t" // H3 - H4
> - "paddw %%mm4, %%mm4 \n\t" // 2L2
> - "paddw %%mm5, %%mm5 \n\t" // 2H2
> - "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
> - "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
> -
> - "lea (%%"FF_REG_a", %1), %0 \n\t"
> - "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
> - "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
> - "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
> - "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
> -//50 opcodes so far
> - "movq (%0, %1, 2), %%mm2 \n\t"
> - "movq %%mm2, %%mm3 \n\t"
> - "punpcklbw %%mm7, %%mm2 \n\t" // L5
> - "punpckhbw %%mm7, %%mm3 \n\t" // H5
> - "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
> - "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
> - "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
> - "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
> -
> - "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
> - "punpcklbw %%mm7, %%mm6 \n\t" // L6
> - "psubw %%mm6, %%mm2 \n\t" // L5 - L6
> - "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
> - "punpckhbw %%mm7, %%mm6 \n\t" // H6
> - "psubw %%mm6, %%mm3 \n\t" // H5 - H6
> -
> - "paddw %%mm0, %%mm0 \n\t" // 2L4
> - "paddw %%mm1, %%mm1 \n\t" // 2H4
> - "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
> - "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
> -
> - "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
> - "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
> - "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
> - "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
> -
> - "movq (%0, %1, 4), %%mm2 \n\t"
> - "movq %%mm2, %%mm3 \n\t"
> - "punpcklbw %%mm7, %%mm2 \n\t" // L7
> - "punpckhbw %%mm7, %%mm3 \n\t" // H7
> -
> - "paddw %%mm2, %%mm2 \n\t" // 2L7
> - "paddw %%mm3, %%mm3 \n\t" // 2H7
> - "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
> - "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
> -
> - "movq (%3), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
> - "movq 8(%3), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
> -
> -#if TEMPLATE_PP_MMXEXT
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "psubw %%mm0, %%mm6 \n\t"
> - "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "psubw %%mm1, %%mm6 \n\t"
> - "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "psubw %%mm2, %%mm6 \n\t"
> - "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "psubw %%mm3, %%mm6 \n\t"
> - "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
> -#else
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "pcmpgtw %%mm0, %%mm6 \n\t"
> - "pxor %%mm6, %%mm0 \n\t"
> - "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "pcmpgtw %%mm1, %%mm6 \n\t"
> - "pxor %%mm6, %%mm1 \n\t"
> - "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "pcmpgtw %%mm2, %%mm6 \n\t"
> - "pxor %%mm6, %%mm2 \n\t"
> - "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "pcmpgtw %%mm3, %%mm6 \n\t"
> - "pxor %%mm6, %%mm3 \n\t"
> - "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
> -#endif
> -
> -#if TEMPLATE_PP_MMXEXT
> - "pminsw %%mm2, %%mm0 \n\t"
> - "pminsw %%mm3, %%mm1 \n\t"
> -#else
> - "movq %%mm0, %%mm6 \n\t"
> - "psubusw %%mm2, %%mm6 \n\t"
> - "psubw %%mm6, %%mm0 \n\t"
> - "movq %%mm1, %%mm6 \n\t"
> - "psubusw %%mm3, %%mm6 \n\t"
> - "psubw %%mm6, %%mm1 \n\t"
> -#endif
> -
> - "movd %2, %%mm2 \n\t" // QP
> - "punpcklbw %%mm7, %%mm2 \n\t"
> -
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
> - "pxor %%mm6, %%mm4 \n\t"
> - "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
> - "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
> - "pxor %%mm7, %%mm5 \n\t"
> - "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
> -// 100 opcodes
> - "psllw $3, %%mm2 \n\t" // 8QP
> - "movq %%mm2, %%mm3 \n\t" // 8QP
> - "pcmpgtw %%mm4, %%mm2 \n\t"
> - "pcmpgtw %%mm5, %%mm3 \n\t"
> - "pand %%mm2, %%mm4 \n\t"
> - "pand %%mm3, %%mm5 \n\t"
> -
> -
> - "psubusw %%mm0, %%mm4 \n\t" // hd
> - "psubusw %%mm1, %%mm5 \n\t" // ld
> -
> -
> - "movq "MANGLE(w05)", %%mm2 \n\t" // 5
> - "pmullw %%mm2, %%mm4 \n\t"
> - "pmullw %%mm2, %%mm5 \n\t"
> - "movq "MANGLE(w20)", %%mm2 \n\t" // 32
> - "paddw %%mm2, %%mm4 \n\t"
> - "paddw %%mm2, %%mm5 \n\t"
> - "psrlw $6, %%mm4 \n\t"
> - "psrlw $6, %%mm5 \n\t"
> -
> - "movq 16(%3), %%mm0 \n\t" // L3 - L4
> - "movq 24(%3), %%mm1 \n\t" // H3 - H4
> -
> - "pxor %%mm2, %%mm2 \n\t"
> - "pxor %%mm3, %%mm3 \n\t"
> -
> - "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
> - "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
> - "pxor %%mm2, %%mm0 \n\t"
> - "pxor %%mm3, %%mm1 \n\t"
> - "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
> - "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
> - "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
> - "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
> -
> - "pxor %%mm6, %%mm2 \n\t"
> - "pxor %%mm7, %%mm3 \n\t"
> - "pand %%mm2, %%mm4 \n\t"
> - "pand %%mm3, %%mm5 \n\t"
> -
> -#if TEMPLATE_PP_MMXEXT
> - "pminsw %%mm0, %%mm4 \n\t"
> - "pminsw %%mm1, %%mm5 \n\t"
> -#else
> - "movq %%mm4, %%mm2 \n\t"
> - "psubusw %%mm0, %%mm2 \n\t"
> - "psubw %%mm2, %%mm4 \n\t"
> - "movq %%mm5, %%mm2 \n\t"
> - "psubusw %%mm1, %%mm2 \n\t"
> - "psubw %%mm2, %%mm5 \n\t"
> -#endif
> - "pxor %%mm6, %%mm4 \n\t"
> - "pxor %%mm7, %%mm5 \n\t"
> - "psubw %%mm6, %%mm4 \n\t"
> - "psubw %%mm7, %%mm5 \n\t"
> - "packsswb %%mm5, %%mm4 \n\t"
> - "movq (%0), %%mm0 \n\t"
> - "paddb %%mm4, %%mm0 \n\t"
> - "movq %%mm0, (%0) \n\t"
> - "movq (%0, %1), %%mm0 \n\t"
> - "psubb %%mm4, %%mm0 \n\t"
> - "movq %%mm0, (%0, %1) \n\t"
> -
> - : "+r" (src)
> - : "r" ((x86_reg)stride), "m" (c->pQPb), "r"(tmp)
> - NAMED_CONSTRAINTS_ADD(w05,w20)
> - : "%"FF_REG_a
> - );
> -#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#else //TEMPLATE_PP_MMXEXT
> const int l1= stride;
> const int l2= stride + l1;
> const int l3= stride + l2;
> @@ -1086,14 +823,14 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
> }
> src++;
> }
> -#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#endif //TEMPLATE_PP_MMXEXT
> }
> #endif //TEMPLATE_PP_ALTIVEC
>
> #if !TEMPLATE_PP_ALTIVEC
> static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
> {
> -#if HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
> +#if HAVE_7REGS && TEMPLATE_PP_MMXEXT
> DECLARE_ALIGNED(8, uint64_t, tmp)[3];
> __asm__ volatile(
> "pxor %%mm6, %%mm6 \n\t"
> @@ -1113,20 +850,10 @@ static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
>
> #undef REAL_FIND_MIN_MAX
> #undef FIND_MIN_MAX
> -#if TEMPLATE_PP_MMXEXT
> #define REAL_FIND_MIN_MAX(addr)\
> "movq " #addr ", %%mm0 \n\t"\
> "pminub %%mm0, %%mm7 \n\t"\
> "pmaxub %%mm0, %%mm6 \n\t"
> -#else
> -#define REAL_FIND_MIN_MAX(addr)\
> - "movq " #addr ", %%mm0 \n\t"\
> - "movq %%mm7, %%mm1 \n\t"\
> - "psubusb %%mm0, %%mm6 \n\t"\
> - "paddb %%mm0, %%mm6 \n\t"\
> - "psubusb %%mm0, %%mm1 \n\t"\
> - "psubb %%mm1, %%mm7 \n\t"
> -#endif
> #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
>
> FIND_MIN_MAX((%%FF_REGa))
> @@ -1140,49 +867,20 @@ FIND_MIN_MAX((%0, %1, 8))
>
> "movq %%mm7, %%mm4 \n\t"
> "psrlq $8, %%mm7 \n\t"
> -#if TEMPLATE_PP_MMXEXT
> "pminub %%mm4, %%mm7 \n\t" // min of pixels
> "pshufw $0xF9, %%mm7, %%mm4 \n\t"
> "pminub %%mm4, %%mm7 \n\t" // min of pixels
> "pshufw $0xFE, %%mm7, %%mm4 \n\t"
> "pminub %%mm4, %%mm7 \n\t"
> -#else
> - "movq %%mm7, %%mm1 \n\t"
> - "psubusb %%mm4, %%mm1 \n\t"
> - "psubb %%mm1, %%mm7 \n\t"
> - "movq %%mm7, %%mm4 \n\t"
> - "psrlq $16, %%mm7 \n\t"
> - "movq %%mm7, %%mm1 \n\t"
> - "psubusb %%mm4, %%mm1 \n\t"
> - "psubb %%mm1, %%mm7 \n\t"
> - "movq %%mm7, %%mm4 \n\t"
> - "psrlq $32, %%mm7 \n\t"
> - "movq %%mm7, %%mm1 \n\t"
> - "psubusb %%mm4, %%mm1 \n\t"
> - "psubb %%mm1, %%mm7 \n\t"
> -#endif
>
>
> "movq %%mm6, %%mm4 \n\t"
> "psrlq $8, %%mm6 \n\t"
> -#if TEMPLATE_PP_MMXEXT
> "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
> "pshufw $0xF9, %%mm6, %%mm4 \n\t"
> "pmaxub %%mm4, %%mm6 \n\t"
> "pshufw $0xFE, %%mm6, %%mm4 \n\t"
> "pmaxub %%mm4, %%mm6 \n\t"
> -#else
> - "psubusb %%mm4, %%mm6 \n\t"
> - "paddb %%mm4, %%mm6 \n\t"
> - "movq %%mm6, %%mm4 \n\t"
> - "psrlq $16, %%mm6 \n\t"
> - "psubusb %%mm4, %%mm6 \n\t"
> - "paddb %%mm4, %%mm6 \n\t"
> - "movq %%mm6, %%mm4 \n\t"
> - "psrlq $32, %%mm6 \n\t"
> - "psubusb %%mm4, %%mm6 \n\t"
> - "paddb %%mm4, %%mm6 \n\t"
> -#endif
> "movq %%mm6, %%mm0 \n\t" // max
> "psubb %%mm7, %%mm6 \n\t" // max - min
> "push %%"FF_REG_a" \n\t"
> @@ -1320,7 +1018,7 @@ DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,
> NAMED_CONSTRAINTS_ADD(deringThreshold,b00,b02,b08)
> : "%"FF_REG_a, "%"FF_REG_d
> );
> -#else // HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
> +#else // HAVE_7REGS && TEMPLATE_PP_MMXEXT
> int y;
> int min=255;
> int max=0;
> @@ -1438,7 +1136,7 @@ DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,
> // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
> }
> #endif
> -#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#endif //TEMPLATE_PP_MMXEXT
> }
> #endif //TEMPLATE_PP_ALTIVEC
>
> @@ -1450,7 +1148,7 @@ DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,
> */
> static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
> {
> -#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#if TEMPLATE_PP_MMXEXT
> src+= 4*stride;
> __asm__ volatile(
> "lea (%0, %1), %%"FF_REG_a" \n\t"
> @@ -1503,14 +1201,13 @@ static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int strid
> */
> static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
> {
> -#if TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#if TEMPLATE_PP_SSE2
> src+= stride*3;
> __asm__ volatile(
> "lea (%0, %1), %%"FF_REG_a" \n\t"
> "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
> "lea (%%"FF_REG_d", %1, 4), %%"FF_REG_c"\n\t"
> "add %1, %%"FF_REG_c" \n\t"
> -#if TEMPLATE_PP_SSE2
> "pxor %%xmm7, %%xmm7 \n\t"
> #define REAL_DEINT_CUBIC(a,b,c,d,e)\
> "movq " #a ", %%xmm0 \n\t"\
> @@ -1526,33 +1223,6 @@ static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride
> "psubw %%xmm0, %%xmm1 \n\t"\
> "packuswb %%xmm1, %%xmm1 \n\t"\
> "movlps %%xmm1, " #c " \n\t"
> -#else //TEMPLATE_PP_SSE2
> - "pxor %%mm7, %%mm7 \n\t"
> -// 0 1 2 3 4 5 6 7 8 9 10
> -// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
> -
> -#define REAL_DEINT_CUBIC(a,b,c,d,e)\
> - "movq " #a ", %%mm0 \n\t"\
> - "movq " #b ", %%mm1 \n\t"\
> - "movq " #d ", %%mm2 \n\t"\
> - "movq " #e ", %%mm3 \n\t"\
> - PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
> - PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
> - "movq %%mm0, %%mm2 \n\t"\
> - "punpcklbw %%mm7, %%mm0 \n\t"\
> - "punpckhbw %%mm7, %%mm2 \n\t"\
> - "movq %%mm1, %%mm3 \n\t"\
> - "punpcklbw %%mm7, %%mm1 \n\t"\
> - "punpckhbw %%mm7, %%mm3 \n\t"\
> - "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
> - "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
> - "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
> - "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
> - "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
> - "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
> - "packuswb %%mm3, %%mm1 \n\t"\
> - "movq %%mm1, " #c " \n\t"
> -#endif //TEMPLATE_PP_SSE2
> #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
>
> DEINT_CUBIC((%0) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd, %1))
> @@ -1562,13 +1232,11 @@ DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8) , (%%FF_REGd, %1, 4), (%%FF_REGc)
>
> : : "r" (src), "r" ((x86_reg)stride)
> :
> -#if TEMPLATE_PP_SSE2
> XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",)
> -#endif
> "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c
> );
> #undef REAL_DEINT_CUBIC
> -#else //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#else //TEMPLATE_PP_SSE2
> int x;
> src+= stride*3;
> for(x=0; x<8; x++){
> @@ -1578,7 +1246,7 @@ DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8) , (%%FF_REGd, %1, 4), (%%FF_REGc)
> src[stride*9] = av_clip_uint8((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
> src++;
> }
> -#endif //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#endif //TEMPLATE_PP_SSE2
> }
>
> /**
> @@ -1590,7 +1258,7 @@ DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8) , (%%FF_REGd, %1, 4), (%%FF_REGc)
> */
> static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
> {
> -#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#if TEMPLATE_PP_MMXEXT
> src+= stride*4;
> __asm__ volatile(
> "lea (%0, %1), %%"FF_REG_a" \n\t"
> @@ -1639,7 +1307,7 @@ DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4
> : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
> : "%"FF_REG_a, "%"FF_REG_d
> );
> -#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#else //TEMPLATE_PP_MMXEXT
> int x;
> src+= stride*4;
> for(x=0; x<8; x++){
> @@ -1657,7 +1325,7 @@ DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4
>
> src++;
> }
> -#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#endif //TEMPLATE_PP_MMXEXT
> }
>
> /**
> @@ -1669,7 +1337,7 @@ DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4
> */
> static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
> {
> -#if (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
> +#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
> src+= stride*4;
> __asm__ volatile(
> "lea (%0, %1), %%"FF_REG_a" \n\t"
> @@ -1729,7 +1397,7 @@ DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4
> : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
> : "%"FF_REG_a, "%"FF_REG_d
> );
> -#else //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
> +#else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
> int x;
> src+= stride*4;
> for(x=0; x<8; x++){
> @@ -1758,7 +1426,7 @@ DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4
>
> src++;
> }
> -#endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
> +#endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS
> }
>
> /**
> @@ -1770,7 +1438,7 @@ DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4
> */
> static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
> {
> -#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#if TEMPLATE_PP_MMXEXT
> src+= 4*stride;
> __asm__ volatile(
> "lea (%0, %1), %%"FF_REG_a" \n\t"
> @@ -1817,7 +1485,7 @@ static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uin
> : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
> : "%"FF_REG_a, "%"FF_REG_d
> );
> -#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#else //TEMPLATE_PP_MMXEXT
> int a, b, c, x;
> src+= 4*stride;
>
> @@ -1860,7 +1528,7 @@ static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uin
> src += 4;
> tmp += 4;
> }
> -#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
> +#endif //TEMPLATE_PP_MMXEXT
> }
>
> /**
> @@ -1871,9 +1539,8 @@ static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uin
> */
> static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
> {
> -#if TEMPLATE_PP_MMX
> - src+= 4*stride;
> #if TEMPLATE_PP_MMXEXT
> + src+= 4*stride;
> __asm__ volatile(
> "lea (%0, %1), %%"FF_REG_a" \n\t"
> "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
> @@ -1922,48 +1589,6 @@ static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
> : "%"FF_REG_a, "%"FF_REG_d
> );
>
> -#else // MMX without MMX2
> - __asm__ volatile(
> - "lea (%0, %1), %%"FF_REG_a" \n\t"
> - "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
> -// 0 1 2 3 4 5 6 7 8 9
> -// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
> - "pxor %%mm7, %%mm7 \n\t"
> -
> -#define REAL_MEDIAN(a,b,c)\
> - "movq " #a ", %%mm0 \n\t"\
> - "movq " #b ", %%mm2 \n\t"\
> - "movq " #c ", %%mm1 \n\t"\
> - "movq %%mm0, %%mm3 \n\t"\
> - "movq %%mm1, %%mm4 \n\t"\
> - "movq %%mm2, %%mm5 \n\t"\
> - "psubusb %%mm1, %%mm3 \n\t"\
> - "psubusb %%mm2, %%mm4 \n\t"\
> - "psubusb %%mm0, %%mm5 \n\t"\
> - "pcmpeqb %%mm7, %%mm3 \n\t"\
> - "pcmpeqb %%mm7, %%mm4 \n\t"\
> - "pcmpeqb %%mm7, %%mm5 \n\t"\
> - "movq %%mm3, %%mm6 \n\t"\
> - "pxor %%mm4, %%mm3 \n\t"\
> - "pxor %%mm5, %%mm4 \n\t"\
> - "pxor %%mm6, %%mm5 \n\t"\
> - "por %%mm3, %%mm1 \n\t"\
> - "por %%mm4, %%mm2 \n\t"\
> - "por %%mm5, %%mm0 \n\t"\
> - "pand %%mm2, %%mm0 \n\t"\
> - "pand %%mm1, %%mm0 \n\t"\
> - "movq %%mm0, " #b " \n\t"
> -#define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c)
> -
> -MEDIAN((%0) , (%%FF_REGa) , (%%FF_REGa, %1))
> -MEDIAN((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4))
> -MEDIAN((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1))
> -MEDIAN((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8))
> -
> - : : "r" (src), "r" ((x86_reg)stride)
> - : "%"FF_REG_a, "%"FF_REG_d
> - );
> -#endif //TEMPLATE_PP_MMXEXT
> #else //TEMPLATE_PP_MMX
> int x, y;
> src+= 4*stride;
> @@ -2165,7 +1790,7 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
>
> #define FAST_L2_DIFF
> //#define L1_DIFF //u should change the thresholds too if u try that one
> -#if (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
> +#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
> __asm__ volatile(
> "lea (%2, %2, 2), %%"FF_REG_a" \n\t" // 3*stride
> "lea (%2, %2, 4), %%"FF_REG_d" \n\t" // 5*stride
> @@ -2454,7 +2079,7 @@ L2_DIFF_CORE((%0, %%FF_REGc) , (%1, %%FF_REGc))
> NAMED_CONSTRAINTS_ADD(b80)
> : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c, "memory"
> );
> -#else //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
> +#else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
> {
> int y;
> int d=0;
> @@ -2537,11 +2162,11 @@ Switch between
> }
> }
> }
> -#endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
> +#endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS
> }
> #endif //TEMPLATE_PP_ALTIVEC
>
> -#if TEMPLATE_PP_MMX
> +#if TEMPLATE_PP_MMXEXT
> /**
> * accurate deblock filter
> */
> @@ -2945,7 +2570,6 @@ static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int st
> "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
> "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
>
> -#if TEMPLATE_PP_MMXEXT
> "movq %%mm7, %%mm6 \n\t" // 0
> "psubw %%mm0, %%mm6 \n\t"
> "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
> @@ -2958,36 +2582,9 @@ static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int st
> "movq %%mm7, %%mm6 \n\t" // 0
> "psubw %%mm3, %%mm6 \n\t"
> "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
> -#else
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "pcmpgtw %%mm0, %%mm6 \n\t"
> - "pxor %%mm6, %%mm0 \n\t"
> - "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "pcmpgtw %%mm1, %%mm6 \n\t"
> - "pxor %%mm6, %%mm1 \n\t"
> - "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "pcmpgtw %%mm2, %%mm6 \n\t"
> - "pxor %%mm6, %%mm2 \n\t"
> - "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
> - "movq %%mm7, %%mm6 \n\t" // 0
> - "pcmpgtw %%mm3, %%mm6 \n\t"
> - "pxor %%mm6, %%mm3 \n\t"
> - "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
> -#endif
>
> -#if TEMPLATE_PP_MMXEXT
> "pminsw %%mm2, %%mm0 \n\t"
> "pminsw %%mm3, %%mm1 \n\t"
> -#else
> - "movq %%mm0, %%mm6 \n\t"
> - "psubusw %%mm2, %%mm6 \n\t"
> - "psubw %%mm6, %%mm0 \n\t"
> - "movq %%mm1, %%mm6 \n\t"
> - "psubusw %%mm3, %%mm6 \n\t"
> - "psubw %%mm6, %%mm1 \n\t"
> -#endif
>
> "movd %2, %%mm2 \n\t" // QP
> "punpcklbw %%mm7, %%mm2 \n\t"
> @@ -3041,17 +2638,8 @@ static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int st
> "pand %%mm2, %%mm4 \n\t"
> "pand %%mm3, %%mm5 \n\t"
>
> -#if TEMPLATE_PP_MMXEXT
> "pminsw %%mm0, %%mm4 \n\t"
> "pminsw %%mm1, %%mm5 \n\t"
> -#else
> - "movq %%mm4, %%mm2 \n\t"
> - "psubusw %%mm0, %%mm2 \n\t"
> - "psubw %%mm2, %%mm4 \n\t"
> - "movq %%mm5, %%mm2 \n\t"
> - "psubusw %%mm1, %%mm2 \n\t"
> - "psubw %%mm2, %%mm5 \n\t"
> -#endif
> "pxor %%mm6, %%mm4 \n\t"
> "pxor %%mm7, %%mm5 \n\t"
> "psubw %%mm6, %%mm4 \n\t"
> @@ -3088,18 +2676,14 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
> static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
> int levelFix, int64_t *packedOffsetAndScale)
> {
> -#if !TEMPLATE_PP_MMX || !HAVE_6REGS
> - int i;
> -#endif
> if(levelFix){
> -#if TEMPLATE_PP_MMX && HAVE_6REGS
> +#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
> __asm__ volatile(
> "movq (%%"FF_REG_a"), %%mm2 \n\t" // packedYOffset
> "movq 8(%%"FF_REG_a"), %%mm3 \n\t" // packedYScale
> "lea (%2,%4), %%"FF_REG_a" \n\t"
> "lea (%3,%5), %%"FF_REG_d" \n\t"
> "pxor %%mm4, %%mm4 \n\t"
> -#if TEMPLATE_PP_MMXEXT
> #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
> "movq " #src1 ", %%mm0 \n\t"\
> "movq " #src1 ", %%mm5 \n\t"\
> @@ -3122,34 +2706,6 @@ static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t
> "movq %%mm0, " #dst1 " \n\t"\
> "movq %%mm1, " #dst2 " \n\t"\
>
> -#else //TEMPLATE_PP_MMXEXT
> -#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
> - "movq " #src1 ", %%mm0 \n\t"\
> - "movq " #src1 ", %%mm5 \n\t"\
> - "punpcklbw %%mm4, %%mm0 \n\t"\
> - "punpckhbw %%mm4, %%mm5 \n\t"\
> - "psubw %%mm2, %%mm0 \n\t"\
> - "psubw %%mm2, %%mm5 \n\t"\
> - "movq " #src2 ", %%mm1 \n\t"\
> - "psllw $6, %%mm0 \n\t"\
> - "psllw $6, %%mm5 \n\t"\
> - "pmulhw %%mm3, %%mm0 \n\t"\
> - "movq " #src2 ", %%mm6 \n\t"\
> - "pmulhw %%mm3, %%mm5 \n\t"\
> - "punpcklbw %%mm4, %%mm1 \n\t"\
> - "punpckhbw %%mm4, %%mm6 \n\t"\
> - "psubw %%mm2, %%mm1 \n\t"\
> - "psubw %%mm2, %%mm6 \n\t"\
> - "psllw $6, %%mm1 \n\t"\
> - "psllw $6, %%mm6 \n\t"\
> - "pmulhw %%mm3, %%mm1 \n\t"\
> - "pmulhw %%mm3, %%mm6 \n\t"\
> - "packuswb %%mm5, %%mm0 \n\t"\
> - "packuswb %%mm6, %%mm1 \n\t"\
> - "movq %%mm0, " #dst1 " \n\t"\
> - "movq %%mm1, " #dst2 " \n\t"\
> -
> -#endif //TEMPLATE_PP_MMXEXT
> #define SCALED_CPY(src1, src2, dst1, dst2)\
> REAL_SCALED_CPY(src1, src2, dst1, dst2)
>
> @@ -3170,7 +2726,7 @@ SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5,
> : "%"FF_REG_d
> );
> #else //TEMPLATE_PP_MMX && HAVE_6REGS
> - for(i=0; i<8; i++)
> + for (int i = 0; i < 8; i++)
> memcpy( &(dst[dstStride*i]),
> &(src[srcStride*i]), BLOCK_SIZE);
> #endif //TEMPLATE_PP_MMX && HAVE_6REGS
> @@ -3203,7 +2759,7 @@ SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3,
> : "%"FF_REG_a, "%"FF_REG_d
> );
> #else //TEMPLATE_PP_MMX && HAVE_6REGS
> - for(i=0; i<8; i++)
> + for (int i = 0; i < 8; i++)
> memcpy( &(dst[dstStride*i]),
> &(src[srcStride*i]), BLOCK_SIZE);
> #endif //TEMPLATE_PP_MMX && HAVE_6REGS
> @@ -3696,9 +3252,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
> }
> }
> }
> -#if TEMPLATE_PP_3DNOW
> - __asm__ volatile("femms");
> -#elif TEMPLATE_PP_MMX
> +#if TEMPLATE_PP_MMX
> __asm__ volatile("emms");
> #endif
>
> @@ -3734,5 +3288,4 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
> #undef TEMPLATE_PP_ALTIVEC
> #undef TEMPLATE_PP_MMX
> #undef TEMPLATE_PP_MMXEXT
> -#undef TEMPLATE_PP_3DNOW
> #undef TEMPLATE_PP_SSE2
Will apply this patchset tomorrow unless there are objections.
- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2022-11-08 15:41 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-11-06 16:27 Andreas Rheinhardt
2022-11-06 16:30 ` [FFmpeg-devel] [PATCH 2/7] avcodec/x86/cavsdsp: Remove unused 3DNow-macro Andreas Rheinhardt
2022-11-06 16:30 ` [FFmpeg-devel] [PATCH 3/7] swscale/utils: Remove obsolete 3DNow reference Andreas Rheinhardt
2022-11-06 16:30 ` [FFmpeg-devel] [PATCH 4/7] postproc/postprocess_template: Fix alignment Andreas Rheinhardt
2022-11-06 16:30 ` [FFmpeg-devel] [PATCH 5/7] avfilter/vf_qp: Don't store NAN in int Andreas Rheinhardt
2022-11-06 16:31 ` [FFmpeg-devel] [PATCH 6/7] avfilter/phase_template: Fix left-shift of negative numbers Andreas Rheinhardt
2022-11-06 16:31 ` [FFmpeg-devel] [PATCH 7/7] avfilter/vf_hqdn3d: " Andreas Rheinhardt
2022-11-08 15:41 ` Andreas Rheinhardt [this message]
2022-11-08 15:43 ` [FFmpeg-devel] [PATCH 1/7] postproc/postprocess: Remove obsolete MMX(EXT)/3Dnow functions Paul B Mahol
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=AS8P250MB0744CF39802FD5D4A137A4A88F3F9@AS8P250MB0744.EURP250.PROD.OUTLOOK.COM \
--to=andreas.rheinhardt@outlook.com \
--cc=ffmpeg-devel@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git