[FFmpeg-devel] [PATCH] hpeldsp (PR #20668)

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH] hpeldsp (PR #20668)
@ 2025-10-08 15:20 mkver via ffmpeg-devel
  0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-08 15:20 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: mkver

PR #20668 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20668
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20668.patch


>From 09c0c45423ddbfe6925a7b1aed004a2f35d92e9e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 3 Oct 2025 04:14:59 +0200
Subject: [PATCH 1/3] avcodec/x86/hpeldsp: Add SSE2 versions of {avg,put}
 no_rnd xy2 with blocksize 16

Also remove the now superseded MMX versions (the new functions have the
exact same codesize as the removed ones).

Old benchmarks:
avg_no_rnd_pixels_tab[0][3]_c:                         233.7 ( 1.00x)
avg_no_rnd_pixels_tab[0][3]_mmx:                       121.5 ( 1.92x)
put_no_rnd_pixels_tab[0][3]_c:                         171.4 ( 1.00x)
put_no_rnd_pixels_tab[0][3]_mmx:                        82.6 ( 2.08x)

New benchmarks:
avg_no_rnd_pixels_tab[0][3]_c:                         233.3 ( 1.00x)
avg_no_rnd_pixels_tab[0][3]_sse2:                       45.0 ( 5.18x)
put_no_rnd_pixels_tab[0][3]_c:                         172.1 ( 1.00x)
put_no_rnd_pixels_tab[0][3]_sse2:                       40.9 ( 4.21x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/hpeldsp.asm    | 13 +++--
 libavcodec/x86/hpeldsp_init.c | 89 +++--------------------------------
 2 files changed, 14 insertions(+), 88 deletions(-)

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index e9f988f7b5..4c19da3e2a 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -29,6 +29,7 @@
 
 SECTION_RODATA
 cextern pb_1
+cextern pw_1
 cextern pw_2
 pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
 pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7
@@ -407,10 +408,10 @@ AVG_PIXELS8_Y2
 
 
 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro SET_PIXELS_XY2 1
-cglobal %1_pixels16_xy2, 4,5,8
+%macro SET_PIXELS_XY2 2-3
+cglobal %1%3_pixels16_xy2, 4,5,8
     pxor        m7, m7
-    mova        m6, [pw_2]
+    mova        m6, [%2]
     movu        m0, [r1]
     movu        m4, [r1+1]
     mova        m1, m0
@@ -481,8 +482,10 @@ cglobal %1_pixels16_xy2, 4,5,8
 %endmacro
 
 INIT_XMM sse2
-SET_PIXELS_XY2 put
-SET_PIXELS_XY2 avg
+SET_PIXELS_XY2 put, pw_2
+SET_PIXELS_XY2 avg, pw_2
+SET_PIXELS_XY2 put, pw_1, _no_rnd
+SET_PIXELS_XY2 avg, pw_1, _no_rnd
 
 %macro SSSE3_PIXELS_XY2 1-2
 %if %0 == 2 ; sse2
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index cb47cb7752..44e44f0975 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -30,7 +30,6 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/hpeldsp.h"
-#include "libavcodec/pixels.h"
 #include "fpel.h"
 #include "hpeldsp.h"
 #include "inline_asm.h"
@@ -65,6 +64,10 @@ void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
 void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void ff_avg_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -143,94 +146,12 @@ static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
         :FF_REG_a, "memory");
 }
 
-// this routine is 'slightly' suboptimal but mostly unused
-static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
-                                       ptrdiff_t line_size, int h)
-{
-    MOVQ_ZERO(mm7);
-    MOVQ_WONE(mm6); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ volatile(
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm4            \n\t"
-        "movq   %%mm0, %%mm1            \n\t"
-        "movq   %%mm4, %%mm5            \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm4         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm5         \n\t"
-        "paddusw %%mm0, %%mm4           \n\t"
-        "paddusw %%mm1, %%mm5           \n\t"
-        "xor    %%"FF_REG_a", %%"FF_REG_a" \n\t"
-        "add    %3, %1                  \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %%"FF_REG_a"), %%mm0  \n\t"
-        "movq   1(%1, %%"FF_REG_a"), %%mm2 \n\t"
-        "movq   %%mm0, %%mm1            \n\t"
-        "movq   %%mm2, %%mm3            \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "paddusw %%mm2, %%mm0           \n\t"
-        "paddusw %%mm3, %%mm1           \n\t"
-        "paddusw %%mm6, %%mm4           \n\t"
-        "paddusw %%mm6, %%mm5           \n\t"
-        "paddusw %%mm0, %%mm4           \n\t"
-        "paddusw %%mm1, %%mm5           \n\t"
-        "psrlw  $2, %%mm4               \n\t"
-        "psrlw  $2, %%mm5               \n\t"
-                "movq   (%2, %%"FF_REG_a"), %%mm3  \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-                "pcmpeqd %%mm2, %%mm2   \n\t"
-                "paddb %%mm2, %%mm2     \n\t"
-                PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
-                "movq   %%mm5, (%2, %%"FF_REG_a")  \n\t"
-        "add    %3, %%"FF_REG_a"        \n\t"
-
-        "movq   (%1, %%"FF_REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
-        "movq   1(%1, %%"FF_REG_a"), %%mm4 \n\t"
-        "movq   %%mm2, %%mm3            \n\t"
-        "movq   %%mm4, %%mm5            \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpcklbw %%mm7, %%mm4         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "punpckhbw %%mm7, %%mm5         \n\t"
-        "paddusw %%mm2, %%mm4           \n\t"
-        "paddusw %%mm3, %%mm5           \n\t"
-        "paddusw %%mm6, %%mm0           \n\t"
-        "paddusw %%mm6, %%mm1           \n\t"
-        "paddusw %%mm4, %%mm0           \n\t"
-        "paddusw %%mm5, %%mm1           \n\t"
-        "psrlw  $2, %%mm0               \n\t"
-        "psrlw  $2, %%mm1               \n\t"
-                "movq   (%2, %%"FF_REG_a"), %%mm3  \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-                "pcmpeqd %%mm2, %%mm2   \n\t"
-                "paddb %%mm2, %%mm2     \n\t"
-                PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
-                "movq   %%mm1, (%2, %%"FF_REG_a")  \n\t"
-        "add    %3, %%"FF_REG_a"           \n\t"
-
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels)
-        :"D"(block), "r"((x86_reg)line_size)
-        :FF_REG_a, "memory");
-}
-
-#if HAVE_MMX
-CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
-CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
-#endif
 #endif /* HAVE_INLINE_ASM */
 
 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
 {
 #if HAVE_MMX_INLINE
-    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
     c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
-    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
 #endif
 #if HAVE_MMX_EXTERNAL
     c->put_no_rnd_pixels_tab[1][0] =
@@ -269,6 +190,7 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
     c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
     c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_sse2;
     c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_sse2;
 
     c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
     c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
@@ -278,6 +200,7 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
     c->avg_no_rnd_pixels_tab[0]    = ff_avg_pixels16_sse2;
     c->avg_no_rnd_pixels_tab[1]    = ff_avg_no_rnd_pixels16_x2_sse2;
     c->avg_no_rnd_pixels_tab[2]    = ff_avg_no_rnd_pixels16_y2_sse2;
+    c->avg_no_rnd_pixels_tab[3]    = ff_avg_no_rnd_pixels16_xy2_sse2;
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 
-- 
2.49.1


>From 1f91867029276984e464eb2b46cf8a053b2155a7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 3 Oct 2025 05:20:32 +0200
Subject: [PATCH 2/3] avcodec/x86/hpeldsp: Add
 ff_put_no_rnd_pixels8_xy2_ssse3()

Given that one has to deal with 16-byte intermediates, it is
unsurprising that an XMM-register implementation (SSSE3 here) wins
against MMX; the MMX version has therefore been removed (as well as
the now unused inline_asm.h).
The new function is even 32B smaller than the old MMX one.

Old benchmarks:
put_no_rnd_pixels_tab[1][3]_c:                          84.1 ( 1.00x)
put_no_rnd_pixels_tab[1][3]_mmx:                        41.1 ( 2.05x)

New benchmarks:
put_no_rnd_pixels_tab[1][3]_c:                          84.0 ( 1.00x)
put_no_rnd_pixels_tab[1][3]_ssse3:                      22.1 ( 3.80x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/hpeldsp.asm    |  54 ++++++++++++++++++
 libavcodec/x86/hpeldsp_init.c |  83 ++--------------------------
 libavcodec/x86/inline_asm.h   | 100 ----------------------------------
 3 files changed, 58 insertions(+), 179 deletions(-)
 delete mode 100644 libavcodec/x86/inline_asm.h

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 4c19da3e2a..fc51d72d17 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -407,6 +407,60 @@ INIT_XMM sse2
 AVG_PIXELS8_Y2
 
 
+; void ff_put_no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro SET_PIXELS8_XY2 2-3
+cglobal %1%3_pixels8_xy2, 4,5,5
+    mova        m4, [pb_1]
+    mova        m3, [%2]
+    movh        m0, [r1]
+    movh        m2, [r1+1]
+    punpcklbw   m2, m0
+    pmaddubsw   m2, m4
+    xor         r4, r4
+    add         r1, r2
+.loop:
+    movh        m0, [r1+r4]
+    movh        m1, [r1+r4+1]
+    punpcklbw   m0, m1
+    pmaddubsw   m0, m4
+    paddusw     m2, m3
+    paddusw     m2, m0
+    psrlw       m2, 2
+%ifidn %1, avg
+    movh        m1, [r0+r4]
+    packuswb    m2, m2
+    pavgb       m2, m1
+%else
+    packuswb    m2, m2
+%endif
+    movh   [r0+r4], m2
+    add         r4, r2
+
+    movh        m1, [r1+r4]
+    movh        m2, [r1+r4+1]
+    punpcklbw   m2, m1
+    pmaddubsw   m2, m4
+    paddusw     m0, m3
+    paddusw     m0, m2
+    psrlw       m0, 2
+%ifidn %1, avg
+    movh        m1, [r0+r4]
+    packuswb    m0, m0
+    pavgb       m0, m1
+%else
+    packuswb    m0, m0
+%endif
+    movh   [r0+r4], m0
+    add         r4, r2
+    sub        r3d, 2
+    jnz .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+SET_PIXELS8_XY2 put, pw_1, _no_rnd
+
+
 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro SET_PIXELS_XY2 2-3
 cglobal %1%3_pixels16_xy2, 4,5,8
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 44e44f0975..f4b123ce03 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -32,7 +32,6 @@
 #include "libavcodec/hpeldsp.h"
 #include "fpel.h"
 #include "hpeldsp.h"
-#include "inline_asm.h"
 
 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
@@ -64,6 +63,8 @@ void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
 void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
 void ff_avg_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
@@ -73,86 +74,8 @@ void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 
-#if HAVE_INLINE_ASM
-
-/***********************************/
-/* MMX no rounding */
-
-// put_pixels
-static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
-                                       ptrdiff_t line_size, int h)
-{
-    MOVQ_ZERO(mm7);
-    MOVQ_WONE(mm6); // =1 for no_rnd version
-    __asm__ volatile(
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm4            \n\t"
-        "movq   %%mm0, %%mm1            \n\t"
-        "movq   %%mm4, %%mm5            \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm4         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm5         \n\t"
-        "paddusw %%mm0, %%mm4           \n\t"
-        "paddusw %%mm1, %%mm5           \n\t"
-        "xor    %%"FF_REG_a", %%"FF_REG_a" \n\t"
-        "add    %3, %1                  \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %%"FF_REG_a"), %%mm0  \n\t"
-        "movq   1(%1, %%"FF_REG_a"), %%mm2 \n\t"
-        "movq   %%mm0, %%mm1            \n\t"
-        "movq   %%mm2, %%mm3            \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "paddusw %%mm2, %%mm0           \n\t"
-        "paddusw %%mm3, %%mm1           \n\t"
-        "paddusw %%mm6, %%mm4           \n\t"
-        "paddusw %%mm6, %%mm5           \n\t"
-        "paddusw %%mm0, %%mm4           \n\t"
-        "paddusw %%mm1, %%mm5           \n\t"
-        "psrlw  $2, %%mm4               \n\t"
-        "psrlw  $2, %%mm5               \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-        "movq   %%mm4, (%2, %%"FF_REG_a")  \n\t"
-        "add    %3, %%"FF_REG_a"           \n\t"
-
-        "movq   (%1, %%"FF_REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
-        "movq   1(%1, %%"FF_REG_a"), %%mm4 \n\t"
-        "movq   %%mm2, %%mm3            \n\t"
-        "movq   %%mm4, %%mm5            \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpcklbw %%mm7, %%mm4         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "punpckhbw %%mm7, %%mm5         \n\t"
-        "paddusw %%mm2, %%mm4           \n\t"
-        "paddusw %%mm3, %%mm5           \n\t"
-        "paddusw %%mm6, %%mm0           \n\t"
-        "paddusw %%mm6, %%mm1           \n\t"
-        "paddusw %%mm4, %%mm0           \n\t"
-        "paddusw %%mm5, %%mm1           \n\t"
-        "psrlw  $2, %%mm0               \n\t"
-        "psrlw  $2, %%mm1               \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-        "movq   %%mm0, (%2, %%"FF_REG_a")  \n\t"
-        "add    %3, %%"FF_REG_a"        \n\t"
-
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels)
-        :"D"(block), "r"((x86_reg)line_size)
-        :FF_REG_a, "memory");
-}
-
-#endif /* HAVE_INLINE_ASM */
-
 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
 {
-#if HAVE_MMX_INLINE
-    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
-#endif
 #if HAVE_MMX_EXTERNAL
     c->put_no_rnd_pixels_tab[1][0] =
     c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
@@ -211,6 +134,8 @@ static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
     c->avg_pixels_tab[0][3]            = ff_avg_pixels16_xy2_ssse3;
     c->put_pixels_tab[1][3]            = ff_put_pixels8_xy2_ssse3;
     c->avg_pixels_tab[1][3]            = ff_avg_pixels8_xy2_ssse3;
+
+    c->put_no_rnd_pixels_tab[1][3]     = ff_put_no_rnd_pixels8_xy2_ssse3;
 #endif
 }
 
diff --git a/libavcodec/x86/inline_asm.h b/libavcodec/x86/inline_asm.h
deleted file mode 100644
index 0198746719..0000000000
--- a/libavcodec/x86/inline_asm.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * inline assembly helper macros
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_INLINE_ASM_H
-#define AVCODEC_X86_INLINE_ASM_H
-
-#include "constants.h"
-
-#define MOVQ_WONE(regd) \
-    __asm__ volatile ( \
-    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
-    "psrlw $15, %%" #regd ::)
-
-#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
-#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
-
-#define MOVQ_BFE(regd)                                  \
-    __asm__ volatile (                                  \
-        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
-        "paddb   %%"#regd", %%"#regd"   \n\t" ::)
-
-#ifndef PIC
-#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
-#else
-// for shared library it's better to use this way for accessing constants
-// pcmpeqd -> -1
-#define MOVQ_WTWO(regd)                                 \
-    __asm__ volatile (                                  \
-        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
-        "psrlw         $15, %%"#regd"   \n\t"           \
-        "psllw          $1, %%"#regd"   \n\t"::)
-
-#endif
-
-// using regr as temporary and for the output result
-// first argument is unmodified and second is trashed
-// regfe is supposed to contain 0xfefefefefefefefe
-#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
-    "movq   "#rega", "#regr"            \n\t"                    \
-    "pand   "#regb", "#regr"            \n\t"                    \
-    "pxor   "#rega", "#regb"            \n\t"                    \
-    "pand  "#regfe", "#regb"            \n\t"                    \
-    "psrlq       $1, "#regb"            \n\t"                    \
-    "paddb  "#regb", "#regr"            \n\t"
-
-#define PAVGB_MMX(rega, regb, regr, regfe)                       \
-    "movq   "#rega", "#regr"            \n\t"                    \
-    "por    "#regb", "#regr"            \n\t"                    \
-    "pxor   "#rega", "#regb"            \n\t"                    \
-    "pand  "#regfe", "#regb"            \n\t"                    \
-    "psrlq       $1, "#regb"            \n\t"                    \
-    "psubb  "#regb", "#regr"            \n\t"
-
-// mm6 is supposed to contain 0xfefefefefefefefe
-#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
-    "movq  "#rega", "#regr"             \n\t"                    \
-    "movq  "#regc", "#regp"             \n\t"                    \
-    "pand  "#regb", "#regr"             \n\t"                    \
-    "pand  "#regd", "#regp"             \n\t"                    \
-    "pxor  "#rega", "#regb"             \n\t"                    \
-    "pxor  "#regc", "#regd"             \n\t"                    \
-    "pand    %%mm6, "#regb"             \n\t"                    \
-    "pand    %%mm6, "#regd"             \n\t"                    \
-    "psrlq      $1, "#regb"             \n\t"                    \
-    "psrlq      $1, "#regd"             \n\t"                    \
-    "paddb "#regb", "#regr"             \n\t"                    \
-    "paddb "#regd", "#regp"             \n\t"
-
-#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
-    "movq  "#rega", "#regr"             \n\t"                    \
-    "movq  "#regc", "#regp"             \n\t"                    \
-    "por   "#regb", "#regr"             \n\t"                    \
-    "por   "#regd", "#regp"             \n\t"                    \
-    "pxor  "#rega", "#regb"             \n\t"                    \
-    "pxor  "#regc", "#regd"             \n\t"                    \
-    "pand    %%mm6, "#regb"             \n\t"                    \
-    "pand    %%mm6, "#regd"             \n\t"                    \
-    "psrlq      $1, "#regd"             \n\t"                    \
-    "psrlq      $1, "#regb"             \n\t"                    \
-    "psubb "#regb", "#regr"             \n\t"                    \
-    "psubb "#regd", "#regp"             \n\t"
-
-#endif /* AVCODEC_X86_INLINE_ASM_H */
-- 
2.49.1


>From 790d1fa6b3638d84306b5803a5efb5019881a429 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 3 Oct 2025 09:29:24 +0200
Subject: [PATCH 3/3] avcodec/x86/hpeldsp: Improve
 ff_{avg,put}_pixels8_xy2_ssse3()

These SSSE3 functions use MMX registers (of course without emms
at the end) and process eight bytes of input by unpacking
them into two MMX registers. This is very suboptimal given
that one can just use XMM registers to process eight words.
This commit switches them to using XMM registers.

Old benchmarks:
avg_pixels_tab[1][3]_c:                                114.5 ( 1.00x)
avg_pixels_tab[1][3]_ssse3:                             43.6 ( 2.62x)
put_pixels_tab[1][3]_c:                                 83.6 ( 1.00x)
put_pixels_tab[1][3]_ssse3:                             34.0 ( 2.46x)

New benchmarks:
avg_pixels_tab[1][3]_c:                                115.3 ( 1.00x)
avg_pixels_tab[1][3]_ssse3:                             24.6 ( 4.69x)
put_pixels_tab[1][3]_c:                                 83.8 ( 1.00x)
put_pixels_tab[1][3]_ssse3:                             19.7 ( 4.24x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/hpeldsp.asm | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index fc51d72d17..2587e3c315 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -32,7 +32,6 @@ cextern pb_1
 cextern pw_1
 cextern pw_2
 pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
-pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7
 
 cextern pw_8192
 
@@ -423,9 +422,14 @@ cglobal %1%3_pixels8_xy2, 4,5,5
     movh        m1, [r1+r4+1]
     punpcklbw   m0, m1
     pmaddubsw   m0, m4
+%ifidn %3, _no_rnd
     paddusw     m2, m3
     paddusw     m2, m0
     psrlw       m2, 2
+%else
+    paddusw     m2, m0
+    pmulhrsw    m2, [pw_8192]
+%endif
 %ifidn %1, avg
     movh        m1, [r0+r4]
     packuswb    m2, m2
@@ -440,9 +444,14 @@ cglobal %1%3_pixels8_xy2, 4,5,5
     movh        m2, [r1+r4+1]
     punpcklbw   m2, m1
     pmaddubsw   m2, m4
+%ifidn %3, _no_rnd
     paddusw     m0, m3
     paddusw     m0, m2
     psrlw       m0, 2
+%else
+    paddusw     m0, m2
+    pmulhrsw    m0, [pw_8192]
+%endif
 %ifidn %1, avg
     movh        m1, [r0+r4]
     packuswb    m0, m0
@@ -459,6 +468,8 @@ cglobal %1%3_pixels8_xy2, 4,5,5
 
 INIT_XMM ssse3
 SET_PIXELS8_XY2 put, pw_1, _no_rnd
+SET_PIXELS8_XY2 avg, pw_8192
+SET_PIXELS8_XY2 put, pw_8192
 
 
 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -542,13 +553,8 @@ SET_PIXELS_XY2 put, pw_1, _no_rnd
 SET_PIXELS_XY2 avg, pw_1, _no_rnd
 
 %macro SSSE3_PIXELS_XY2 1-2
-%if %0 == 2 ; sse2
 cglobal %1_pixels16_xy2, 4,5,%2
     mova        m4, [pb_interleave16]
-%else
-cglobal %1_pixels8_xy2, 4,5
-    mova        m4, [pb_interleave8]
-%endif
     mova        m5, [pb_1]
     movu        m0, [r1]
     movu        m1, [r1+1]
@@ -601,9 +607,6 @@ cglobal %1_pixels8_xy2, 4,5
     RET
 %endmacro
 
-INIT_MMX ssse3
-SSSE3_PIXELS_XY2 put
-SSSE3_PIXELS_XY2 avg
 INIT_XMM ssse3
 SSSE3_PIXELS_XY2 put, 6
 SSSE3_PIXELS_XY2 avg, 7
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2025-10-08 15:21 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-08 15:20 [FFmpeg-devel] [PATCH] hpeldsp (PR #20668) mkver via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git