* [FFmpeg-devel] [PATCH] hpeldsp (PR #20668)
@ 2025-10-08 15:20 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-08 15:20 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #20668 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20668
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20668.patch
>From 09c0c45423ddbfe6925a7b1aed004a2f35d92e9e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 3 Oct 2025 04:14:59 +0200
Subject: [PATCH 1/3] avcodec/x86/hpeldsp: Add SSE2 versions of {avg,put} no_rnd
xy2 with blocksize 16
Also remove the now superseded MMX versions (the new functions have the
exact same codesize as the removed ones).
Old benchmarks:
avg_no_rnd_pixels_tab[0][3]_c: 233.7 ( 1.00x)
avg_no_rnd_pixels_tab[0][3]_mmx: 121.5 ( 1.92x)
put_no_rnd_pixels_tab[0][3]_c: 171.4 ( 1.00x)
put_no_rnd_pixels_tab[0][3]_mmx: 82.6 ( 2.08x)
New benchmarks:
avg_no_rnd_pixels_tab[0][3]_c: 233.3 ( 1.00x)
avg_no_rnd_pixels_tab[0][3]_sse2: 45.0 ( 5.18x)
put_no_rnd_pixels_tab[0][3]_c: 172.1 ( 1.00x)
put_no_rnd_pixels_tab[0][3]_sse2: 40.9 ( 4.21x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/hpeldsp.asm | 13 +++--
libavcodec/x86/hpeldsp_init.c | 89 +++--------------------------------
2 files changed, 14 insertions(+), 88 deletions(-)
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index e9f988f7b5..4c19da3e2a 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -29,6 +29,7 @@
SECTION_RODATA
cextern pb_1
+cextern pw_1
cextern pw_2
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7
@@ -407,10 +408,10 @@ AVG_PIXELS8_Y2
; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro SET_PIXELS_XY2 1
-cglobal %1_pixels16_xy2, 4,5,8
+%macro SET_PIXELS_XY2 2-3
+cglobal %1%3_pixels16_xy2, 4,5,8
pxor m7, m7
- mova m6, [pw_2]
+ mova m6, [%2]
movu m0, [r1]
movu m4, [r1+1]
mova m1, m0
@@ -481,8 +482,10 @@ cglobal %1_pixels16_xy2, 4,5,8
%endmacro
INIT_XMM sse2
-SET_PIXELS_XY2 put
-SET_PIXELS_XY2 avg
+SET_PIXELS_XY2 put, pw_2
+SET_PIXELS_XY2 avg, pw_2
+SET_PIXELS_XY2 put, pw_1, _no_rnd
+SET_PIXELS_XY2 avg, pw_1, _no_rnd
%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; sse2
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index cb47cb7752..44e44f0975 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -30,7 +30,6 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"
-#include "libavcodec/pixels.h"
#include "fpel.h"
#include "hpeldsp.h"
#include "inline_asm.h"
@@ -65,6 +64,10 @@ void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -143,94 +146,12 @@ static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
:FF_REG_a, "memory");
}
-// this routine is 'slightly' suboptimal but mostly unused
-static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_ZERO(mm7);
- MOVQ_WONE(mm6); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm4 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
- "add %3, %1 \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
- "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "paddusw %%mm2, %%mm0 \n\t"
- "paddusw %%mm3, %%mm1 \n\t"
- "paddusw %%mm6, %%mm4 \n\t"
- "paddusw %%mm6, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "psrlw $2, %%mm4 \n\t"
- "psrlw $2, %%mm5 \n\t"
- "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "pcmpeqd %%mm2, %%mm2 \n\t"
- "paddb %%mm2, %%mm2 \n\t"
- PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
- "movq %%mm5, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
-
- "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
- "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm2, %%mm4 \n\t"
- "paddusw %%mm3, %%mm5 \n\t"
- "paddusw %%mm6, %%mm0 \n\t"
- "paddusw %%mm6, %%mm1 \n\t"
- "paddusw %%mm4, %%mm0 \n\t"
- "paddusw %%mm5, %%mm1 \n\t"
- "psrlw $2, %%mm0 \n\t"
- "psrlw $2, %%mm1 \n\t"
- "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "pcmpeqd %%mm2, %%mm2 \n\t"
- "paddb %%mm2, %%mm2 \n\t"
- PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
- "movq %%mm1, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
-
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels)
- :"D"(block), "r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
-
-#if HAVE_MMX
-CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
-CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
-#endif
#endif /* HAVE_INLINE_ASM */
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
#if HAVE_MMX_INLINE
- c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
- c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
#endif
#if HAVE_MMX_EXTERNAL
c->put_no_rnd_pixels_tab[1][0] =
@@ -269,6 +190,7 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_sse2;
c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2;
+ c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
@@ -278,6 +200,7 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_sse2;
c->avg_no_rnd_pixels_tab[1] = ff_avg_no_rnd_pixels16_x2_sse2;
c->avg_no_rnd_pixels_tab[2] = ff_avg_no_rnd_pixels16_y2_sse2;
+ c->avg_no_rnd_pixels_tab[3] = ff_avg_no_rnd_pixels16_xy2_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
--
2.49.1
>From 1f91867029276984e464eb2b46cf8a053b2155a7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 3 Oct 2025 05:20:32 +0200
Subject: [PATCH 2/3] avcodec/x86/hpeldsp: Add
ff_put_no_rnd_pixels8_xy2_ssse3()
Given that one has to deal with 16-byte intermediates, it is
unsurprising that a version using XMM registers (here SSSE3) wins
against MMX; the MMX version has therefore been removed (as well as
the now unused inline_asm.h).
The new function is even 32 bytes smaller than the old MMX one.
Old benchmarks:
put_no_rnd_pixels_tab[1][3]_c: 84.1 ( 1.00x)
put_no_rnd_pixels_tab[1][3]_mmx: 41.1 ( 2.05x)
New benchmarks:
put_no_rnd_pixels_tab[1][3]_c: 84.0 ( 1.00x)
put_no_rnd_pixels_tab[1][3]_ssse3: 22.1 ( 3.80x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/hpeldsp.asm | 54 ++++++++++++++++++
libavcodec/x86/hpeldsp_init.c | 83 ++--------------------------
libavcodec/x86/inline_asm.h | 100 ----------------------------------
3 files changed, 58 insertions(+), 179 deletions(-)
delete mode 100644 libavcodec/x86/inline_asm.h
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 4c19da3e2a..fc51d72d17 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -407,6 +407,60 @@ INIT_XMM sse2
AVG_PIXELS8_Y2
+; void ff_put_no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro SET_PIXELS8_XY2 2-3
+cglobal %1%3_pixels8_xy2, 4,5,5
+ mova m4, [pb_1]
+ mova m3, [%2]
+ movh m0, [r1]
+ movh m2, [r1+1]
+ punpcklbw m2, m0
+ pmaddubsw m2, m4
+ xor r4, r4
+ add r1, r2
+.loop:
+ movh m0, [r1+r4]
+ movh m1, [r1+r4+1]
+ punpcklbw m0, m1
+ pmaddubsw m0, m4
+ paddusw m2, m3
+ paddusw m2, m0
+ psrlw m2, 2
+%ifidn %1, avg
+ movh m1, [r0+r4]
+ packuswb m2, m2
+ pavgb m2, m1
+%else
+ packuswb m2, m2
+%endif
+ movh [r0+r4], m2
+ add r4, r2
+
+ movh m1, [r1+r4]
+ movh m2, [r1+r4+1]
+ punpcklbw m2, m1
+ pmaddubsw m2, m4
+ paddusw m0, m3
+ paddusw m0, m2
+ psrlw m0, 2
+%ifidn %1, avg
+ movh m1, [r0+r4]
+ packuswb m0, m0
+ pavgb m0, m1
+%else
+ packuswb m0, m0
+%endif
+ movh [r0+r4], m0
+ add r4, r2
+ sub r3d, 2
+ jnz .loop
+ RET
+%endmacro
+
+INIT_XMM ssse3
+SET_PIXELS8_XY2 put, pw_1, _no_rnd
+
+
; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro SET_PIXELS_XY2 2-3
cglobal %1%3_pixels16_xy2, 4,5,8
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 44e44f0975..f4b123ce03 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -32,7 +32,6 @@
#include "libavcodec/hpeldsp.h"
#include "fpel.h"
#include "hpeldsp.h"
-#include "inline_asm.h"
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
@@ -64,6 +63,8 @@ void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
@@ -73,86 +74,8 @@ void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-#if HAVE_INLINE_ASM
-
-/***********************************/
-/* MMX no rounding */
-
-// put_pixels
-static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_ZERO(mm7);
- MOVQ_WONE(mm6); // =1 for no_rnd version
- __asm__ volatile(
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm4 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
- "add %3, %1 \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
- "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "paddusw %%mm2, %%mm0 \n\t"
- "paddusw %%mm3, %%mm1 \n\t"
- "paddusw %%mm6, %%mm4 \n\t"
- "paddusw %%mm6, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "psrlw $2, %%mm4 \n\t"
- "psrlw $2, %%mm5 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "movq %%mm4, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
-
- "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
- "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm2, %%mm4 \n\t"
- "paddusw %%mm3, %%mm5 \n\t"
- "paddusw %%mm6, %%mm0 \n\t"
- "paddusw %%mm6, %%mm1 \n\t"
- "paddusw %%mm4, %%mm0 \n\t"
- "paddusw %%mm5, %%mm1 \n\t"
- "psrlw $2, %%mm0 \n\t"
- "psrlw $2, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
-
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels)
- :"D"(block), "r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
-
-#endif /* HAVE_INLINE_ASM */
-
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
-#if HAVE_MMX_INLINE
- c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
-#endif
#if HAVE_MMX_EXTERNAL
c->put_no_rnd_pixels_tab[1][0] =
c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
@@ -211,6 +134,8 @@ static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
+
+ c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_ssse3;
#endif
}
diff --git a/libavcodec/x86/inline_asm.h b/libavcodec/x86/inline_asm.h
deleted file mode 100644
index 0198746719..0000000000
--- a/libavcodec/x86/inline_asm.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * inline assembly helper macros
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_INLINE_ASM_H
-#define AVCODEC_X86_INLINE_ASM_H
-
-#include "constants.h"
-
-#define MOVQ_WONE(regd) \
- __asm__ volatile ( \
- "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
- "psrlw $15, %%" #regd ::)
-
-#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
-#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
-
-#define MOVQ_BFE(regd) \
- __asm__ volatile ( \
- "pcmpeqd %%"#regd", %%"#regd" \n\t" \
- "paddb %%"#regd", %%"#regd" \n\t" ::)
-
-#ifndef PIC
-#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
-#else
-// for shared library it's better to use this way for accessing constants
-// pcmpeqd -> -1
-#define MOVQ_WTWO(regd) \
- __asm__ volatile ( \
- "pcmpeqd %%"#regd", %%"#regd" \n\t" \
- "psrlw $15, %%"#regd" \n\t" \
- "psllw $1, %%"#regd" \n\t"::)
-
-#endif
-
-// using regr as temporary and for the output result
-// first argument is unmodified and second is trashed
-// regfe is supposed to contain 0xfefefefefefefefe
-#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
- "movq "#rega", "#regr" \n\t" \
- "pand "#regb", "#regr" \n\t" \
- "pxor "#rega", "#regb" \n\t" \
- "pand "#regfe", "#regb" \n\t" \
- "psrlq $1, "#regb" \n\t" \
- "paddb "#regb", "#regr" \n\t"
-
-#define PAVGB_MMX(rega, regb, regr, regfe) \
- "movq "#rega", "#regr" \n\t" \
- "por "#regb", "#regr" \n\t" \
- "pxor "#rega", "#regb" \n\t" \
- "pand "#regfe", "#regb" \n\t" \
- "psrlq $1, "#regb" \n\t" \
- "psubb "#regb", "#regr" \n\t"
-
-// mm6 is supposed to contain 0xfefefefefefefefe
-#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
- "movq "#rega", "#regr" \n\t" \
- "movq "#regc", "#regp" \n\t" \
- "pand "#regb", "#regr" \n\t" \
- "pand "#regd", "#regp" \n\t" \
- "pxor "#rega", "#regb" \n\t" \
- "pxor "#regc", "#regd" \n\t" \
- "pand %%mm6, "#regb" \n\t" \
- "pand %%mm6, "#regd" \n\t" \
- "psrlq $1, "#regb" \n\t" \
- "psrlq $1, "#regd" \n\t" \
- "paddb "#regb", "#regr" \n\t" \
- "paddb "#regd", "#regp" \n\t"
-
-#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
- "movq "#rega", "#regr" \n\t" \
- "movq "#regc", "#regp" \n\t" \
- "por "#regb", "#regr" \n\t" \
- "por "#regd", "#regp" \n\t" \
- "pxor "#rega", "#regb" \n\t" \
- "pxor "#regc", "#regd" \n\t" \
- "pand %%mm6, "#regb" \n\t" \
- "pand %%mm6, "#regd" \n\t" \
- "psrlq $1, "#regd" \n\t" \
- "psrlq $1, "#regb" \n\t" \
- "psubb "#regb", "#regr" \n\t" \
- "psubb "#regd", "#regp" \n\t"
-
-#endif /* AVCODEC_X86_INLINE_ASM_H */
--
2.49.1
>From 790d1fa6b3638d84306b5803a5efb5019881a429 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 3 Oct 2025 09:29:24 +0200
Subject: [PATCH 3/3] avcodec/x86/hpeldsp: Improve
ff_{avg,put}_pixels8_xy2_ssse3()
These SSSE3 functions use MMX registers (of course without emms
at the end) and process eight bytes of input by unpacking
them into two MMX registers. This is very suboptimal given
that one can just use XMM registers to process eight words.
This commit switches the functions to using XMM registers.
Old benchmarks:
avg_pixels_tab[1][3]_c: 114.5 ( 1.00x)
avg_pixels_tab[1][3]_ssse3: 43.6 ( 2.62x)
put_pixels_tab[1][3]_c: 83.6 ( 1.00x)
put_pixels_tab[1][3]_ssse3: 34.0 ( 2.46x)
New benchmarks:
avg_pixels_tab[1][3]_c: 115.3 ( 1.00x)
avg_pixels_tab[1][3]_ssse3: 24.6 ( 4.69x)
put_pixels_tab[1][3]_c: 83.8 ( 1.00x)
put_pixels_tab[1][3]_ssse3: 19.7 ( 4.24x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/hpeldsp.asm | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index fc51d72d17..2587e3c315 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -32,7 +32,6 @@ cextern pb_1
cextern pw_1
cextern pw_2
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
-pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7
cextern pw_8192
@@ -423,9 +422,14 @@ cglobal %1%3_pixels8_xy2, 4,5,5
movh m1, [r1+r4+1]
punpcklbw m0, m1
pmaddubsw m0, m4
+%ifidn %3, _no_rnd
paddusw m2, m3
paddusw m2, m0
psrlw m2, 2
+%else
+ paddusw m2, m0
+ pmulhrsw m2, [pw_8192]
+%endif
%ifidn %1, avg
movh m1, [r0+r4]
packuswb m2, m2
@@ -440,9 +444,14 @@ cglobal %1%3_pixels8_xy2, 4,5,5
movh m2, [r1+r4+1]
punpcklbw m2, m1
pmaddubsw m2, m4
+%ifidn %3, _no_rnd
paddusw m0, m3
paddusw m0, m2
psrlw m0, 2
+%else
+ paddusw m0, m2
+ pmulhrsw m0, [pw_8192]
+%endif
%ifidn %1, avg
movh m1, [r0+r4]
packuswb m0, m0
@@ -459,6 +468,8 @@ cglobal %1%3_pixels8_xy2, 4,5,5
INIT_XMM ssse3
SET_PIXELS8_XY2 put, pw_1, _no_rnd
+SET_PIXELS8_XY2 avg, pw_8192
+SET_PIXELS8_XY2 put, pw_8192
; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -542,13 +553,8 @@ SET_PIXELS_XY2 put, pw_1, _no_rnd
SET_PIXELS_XY2 avg, pw_1, _no_rnd
%macro SSSE3_PIXELS_XY2 1-2
-%if %0 == 2 ; sse2
cglobal %1_pixels16_xy2, 4,5,%2
mova m4, [pb_interleave16]
-%else
-cglobal %1_pixels8_xy2, 4,5
- mova m4, [pb_interleave8]
-%endif
mova m5, [pb_1]
movu m0, [r1]
movu m1, [r1+1]
@@ -601,9 +607,6 @@ cglobal %1_pixels8_xy2, 4,5
RET
%endmacro
-INIT_MMX ssse3
-SSSE3_PIXELS_XY2 put
-SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-10-08 15:21 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-08 15:20 [FFmpeg-devel] [PATCH] hpeldsp (PR #20668) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git