Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] avfilter/x86/vf_{pullup,spp}: Port functions to SSE2, SSSE3 (PR #20696)
@ 2025-10-12 17:46 mkver via ffmpeg-devel
  0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-12 17:46 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: mkver

PR #20696 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20696
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20696.patch


From 1b3235d4163e0bf31d017c2df12d3198387f4798 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 12 Oct 2025 17:19:26 +0200
Subject: [PATCH 1/2] avfilter/x86/vf_spp: Port store_slice to SSE2

This allows removing the emms_c() call from the filter. It also gives
a 25% speedup here (when timing the calls to store_slice using
START/STOP_TIMER).

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavfilter/vf_spp.c     |  2 --
 libavfilter/x86/vf_spp.c | 49 ++++++++++++++++++++--------------------
 2 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/libavfilter/vf_spp.c b/libavfilter/vf_spp.c
index 5c5b98f8db..20c9fd4340 100644
--- a/libavfilter/vf_spp.c
+++ b/libavfilter/vf_spp.c
@@ -31,7 +31,6 @@
  * ported by Clément Bœsch for FFmpeg.
  */
 
-#include "libavutil/emms.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/mem.h"
 #include "libavutil/mem_internal.h"
@@ -425,7 +424,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
                 filter(s, out->data[1], in->data[1], out->linesize[1], in->linesize[1], cw,        ch,        qp_table, qp_stride, 0, depth);
                 filter(s, out->data[2], in->data[2], out->linesize[2], in->linesize[2], cw,        ch,        qp_table, qp_stride, 0, depth);
             }
-            emms_c();
         }
     }
 
diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c
index f8e5727bfc..48c3d25d7c 100644
--- a/libavfilter/x86/vf_spp.c
+++ b/libavfilter/x86/vf_spp.c
@@ -18,16 +18,20 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include <stdint.h>
 
+#include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
 #include "libavfilter/vf_spp.h"
 
-#if HAVE_MMX_INLINE
-static void store_slice_mmx(uint8_t *dst, const int16_t *src,
-                            int dst_stride, int src_stride,
-                            int width, int height, int log2_scale,
-                            const uint8_t dither[8][8])
+#if HAVE_SSE2_INLINE
+static void store_slice_sse2(uint8_t *dst, const int16_t *src,
+                             int dst_stride, int src_stride,
+                             int width, int height, int log2_scale,
+                             const uint8_t dither[8][8])
 {
     int y;
 
@@ -35,30 +39,25 @@ static void store_slice_mmx(uint8_t *dst, const int16_t *src,
         uint8_t *dst1 = dst;
         const int16_t *src1 = src;
         __asm__ volatile(
-            "movq (%3), %%mm3           \n"
-            "movq (%3), %%mm4           \n"
-            "movd %4, %%mm2             \n"
-            "pxor %%mm0, %%mm0          \n"
-            "punpcklbw %%mm0, %%mm3     \n"
-            "punpckhbw %%mm0, %%mm4     \n"
-            "psraw %%mm2, %%mm3         \n"
-            "psraw %%mm2, %%mm4         \n"
-            "movd %5, %%mm2             \n"
+            "movq         (%3), %%xmm1  \n"
+            "movd           %4, %%xmm2  \n"
+            "pxor       %%xmm0, %%xmm0  \n"
+            "punpcklbw  %%xmm0, %%xmm1  \n"
+            "psraw      %%xmm2, %%xmm1  \n"
+            "movd           %5, %%xmm2  \n"
             "1:                         \n"
-            "movq (%0), %%mm0           \n"
-            "movq 8(%0), %%mm1          \n"
-            "paddw %%mm3, %%mm0         \n"
-            "paddw %%mm4, %%mm1         \n"
-            "psraw %%mm2, %%mm0         \n"
-            "psraw %%mm2, %%mm1         \n"
-            "packuswb %%mm1, %%mm0      \n"
-            "movq %%mm0, (%1)           \n"
+            "movdqa       (%0), %%xmm0  \n"
+            "paddw      %%xmm1, %%xmm0  \n"
+            "psraw      %%xmm2, %%xmm0  \n"
+            "packuswb   %%xmm0, %%xmm0  \n"
+            "movq       %%xmm0, (%1)    \n"
             "add $16, %0                \n"
             "add $8, %1                 \n"
             "cmp %2, %1                 \n"
             " jb 1b                     \n"
             : "+r" (src1), "+r"(dst1)
             : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale)
+            XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
         );
         src += src_stride;
         dst += dst_stride;
@@ -69,11 +68,11 @@ static void store_slice_mmx(uint8_t *dst, const int16_t *src,
 
 av_cold void ff_spp_init_x86(SPPContext *s)
 {
-#if HAVE_MMX_INLINE
+#if HAVE_SSE2_INLINE
     int cpu_flags = av_get_cpu_flags();
 
-    if (cpu_flags & AV_CPU_FLAG_MMX) {
-        s->store_slice = store_slice_mmx;
+    if (INLINE_SSE2(cpu_flags)) {
+        s->store_slice = store_slice_sse2;
     }
 #endif
 }
-- 
2.49.1


From d9571bb9b2e49042a7d2fb0bd18c390b6dc63f57 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 12 Oct 2025 19:28:35 +0200
Subject: [PATCH 2/2] avfilter/x86/vf_pullup: Port pullup functions to SSE2,
 SSSE3

The diff and var functions benefit from psadbw; comb benefits from wider
registers, which make it possible to avoid reloading values, reducing the
number of loads from 48 to 10. Performance increased by 117% (the loop
in compute_metric() has been timed); codesize decreased by 144B.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavfilter/vf_pullup.c          |   2 -
 libavfilter/x86/vf_pullup.asm    | 182 ++++++++++---------------------
 libavfilter/x86/vf_pullup_init.c |  16 +--
 3 files changed, 66 insertions(+), 134 deletions(-)

diff --git a/libavfilter/vf_pullup.c b/libavfilter/vf_pullup.c
index 1e4289aab1..d963840fe9 100644
--- a/libavfilter/vf_pullup.c
+++ b/libavfilter/vf_pullup.c
@@ -19,7 +19,6 @@
  */
 
 #include "libavutil/avassert.h"
-#include "libavutil/emms.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
@@ -597,7 +596,6 @@ static void pullup_submit_field(PullupContext *s, PullupBuffer *b, int parity)
     compute_metric(s, f->diffs, f, parity, f->prev->prev, parity, s->diff);
     compute_metric(s, f->combs, parity ? f->prev : f, 0, parity ? f : f->prev, 1, s->comb);
     compute_metric(s, f->vars, f, parity, f, -1, s->var);
-    emms_c();
 
     /* Advance the circular list */
     if (!s->first)
diff --git a/libavfilter/x86/vf_pullup.asm b/libavfilter/x86/vf_pullup.asm
index 26c2a27d37..6875a846a2 100644
--- a/libavfilter/x86/vf_pullup.asm
+++ b/libavfilter/x86/vf_pullup.asm
@@ -22,157 +22,89 @@
 
 SECTION .text
 
-INIT_MMX mmx
-cglobal pullup_filter_diff, 3, 5, 8, first, second, size
+INIT_XMM sse2
+cglobal pullup_filter_diff, 3, 4, 3, first, second, size
     mov        r3, 4
-    pxor       m4, m4
-    pxor       m7, m7
+    pxor       m2, m2
 
 .loop:
     movq       m0, [firstq]
-    movq       m2, [firstq]
     add        firstq, sizeq
     movq       m1, [secondq]
     add        secondq, sizeq
-    psubusb    m2, m1
-    psubusb    m1, m0
-    movq       m0, m2
-    movq       m3, m1
-    punpcklbw  m0, m7
-    punpcklbw  m1, m7
-    punpckhbw  m2, m7
-    punpckhbw  m3, m7
-    paddw      m4, m0
-    paddw      m4, m1
-    paddw      m4, m2
-    paddw      m4, m3
+    psadbw     m0, m1
+    paddw      m2, m0
 
     dec        r3
     jnz .loop
 
-    movq       m3, m4
-    punpcklwd  m4, m7
-    punpckhwd  m3, m7
-    paddd      m3, m4
-    movd      eax, m3
-    psrlq      m3, 32
-    movd      r4d, m3
-    add       eax, r4d
+    movd      eax, m2
     RET
 
-INIT_MMX mmx
-cglobal pullup_filter_comb, 3, 5, 8, first, second, size
-    mov        r3, 4
+INIT_XMM ssse3
+cglobal pullup_filter_comb, 3, 5, 7, first, second, size
+    movq       m0, [firstq]
+    sub   secondq, sizeq
+    movq       m1, [secondq]
     pxor       m6, m6
-    pxor       m7, m7
-    sub        secondq, sizeq
+    punpcklbw  m0, m6
+    punpcklbw  m1, m6
+    add    firstq, sizeq
+    add   secondq, sizeq
+    pxor       m5, m5
+    mov        r3, 4
 
 .loop:
-    movq       m0, [firstq]
-    movq       m1, [secondq]
-    punpcklbw  m0, m7
-    movq       m2, [secondq+sizeq]
-    punpcklbw  m1, m7
-    punpcklbw  m2, m7
-    paddw      m0, m0
-    paddw      m1, m2
-    movq       m2, m0
-    psubusw    m0, m1
-    psubusw    m1, m2
-    paddw      m6, m0
-    paddw      m6, m1
+    movq       m2, [firstq]
+    movq       m3, [secondq]
+    add    firstq, sizeq
+    add   secondq, sizeq
+    punpcklbw  m2, m6
+    punpcklbw  m3, m6
+    mova       m4, m0
 
-    movq       m0, [firstq]
-    movq       m1, [secondq]
-    punpckhbw  m0, m7
-    movq       m2, [secondq+sizeq]
-    punpckhbw  m1, m7
-    punpckhbw  m2, m7
     paddw      m0, m0
-    paddw      m1, m2
-    movq       m2, m0
-    psubusw    m0, m1
-    psubusw    m1, m2
-    paddw      m6, m0
-    paddw      m6, m1
+    paddw      m1, m3
+    psubw      m0, m1
+    pabsw      m0, m0
+    paddw      m5, m0
 
-    movq       m0, [secondq+sizeq]
-    movq       m1, [firstq]
-    punpcklbw  m0, m7
-    movq       m2, [firstq+sizeq]
-    punpcklbw  m1, m7
-    punpcklbw  m2, m7
-    paddw      m0, m0
-    paddw      m1, m2
-    movq       m2, m0
-    psubusw    m0, m1
-    psubusw    m1, m2
-    paddw      m6, m0
-    paddw      m6, m1
+    mova       m1, m3          ; keep the current second row as "previous second row" for the next pass
+    paddw      m4, m2          ; m4 = current first row + next first row
+    paddw      m3, m3          ; m3 = 2 * current second row
+    psubw      m3, m4
+    pabsw      m3, m3          ; |2*second - (first + next first)|
+    paddw      m5, m3          ; accumulate per-column word sums
+    mova       m0, m2          ; next first row becomes the current first row (dst, src order)
 
-    movq       m0, [secondq+sizeq]
-    movq       m1, [firstq]
-    punpckhbw  m0, m7
-    movq       m2, [firstq+sizeq]
-    punpckhbw  m1, m7
-    punpckhbw  m2, m7
-    paddw      m0, m0
-    paddw      m1, m2
-    movq       m2, m0
-    psubusw    m0, m1
-    psubusw    m1, m2
-    paddw      m6, m0
-    paddw      m6, m1
-
-    add        firstq, sizeq
-    add        secondq, sizeq
     dec        r3
     jnz .loop
 
-    movq       m5, m6
-    punpcklwd  m6, m7
-    punpckhwd  m5, m7
-    paddd      m5, m6
-    movd      eax, m5
-    psrlq      m5, 32
-    movd      r4d, m5
-    add       eax, r4d
+    mova       m0, m5          ; full 128-bit copy: movq would zero words 4..7 before punpckhwd
+    punpcklwd  m5, m6          ; widen words 0..3 to dwords (m6 is zero)
+    punpckhwd  m0, m6          ; widen words 4..7 to dwords
+    paddd      m0, m5
+    pshufd     m5, m0, 0xE     ; bring dwords 2,3 down to positions 0,1
+    paddd      m0, m5
+    pshufd     m5, m0, 0x1     ; bring dword 1 down to position 0
+    paddd      m0, m5
+    movd      eax, m0
     RET
 
-INIT_MMX mmx
-cglobal pullup_filter_var, 3, 5, 8, first, second, size
-    mov        r3, 3
-    pxor       m4, m4
-    pxor       m7, m7
-
-.loop:
+INIT_XMM sse2
+cglobal pullup_filter_var, 3, 3, 3, first, second, size
     movq       m0, [firstq]
-    movq       m2, [firstq]
-    movq       m1, [firstq+sizeq]
     add        firstq, sizeq
-    psubusb    m2, m1
-    psubusb    m1, m0
-    movq       m0, m2
-    movq       m3, m1
-    punpcklbw  m0, m7
-    punpcklbw  m1, m7
-    punpckhbw  m2, m7
-    punpckhbw  m3, m7
-    paddw      m4, m0
-    paddw      m4, m1
-    paddw      m4, m2
-    paddw      m4, m3
-
-    dec        r3
-    jnz .loop
-
-    movq       m3, m4
-    punpcklwd  m4, m7
-    punpckhwd  m3, m7
-    paddd      m3, m4
-    movd      eax, m3
-    psrlq      m3, 32
-    movd      r4d, m3
-    add       eax, r4d
+    movq       m1, [firstq]
+    pxor       m2, m2
+    psadbw     m0, m1
+    paddw      m2, m0
+    movq       m0, [firstq+sizeq]
+    psadbw     m1, m0
+    paddw      m2, m1
+    movq       m1, [firstq+2*sizeq]
+    psadbw     m0, m1
+    paddw      m2, m0
+    movd      eax, m2
     shl       eax, 2
     RET
diff --git a/libavfilter/x86/vf_pullup_init.c b/libavfilter/x86/vf_pullup_init.c
index 562a3fb625..943c1de9d7 100644
--- a/libavfilter/x86/vf_pullup_init.c
+++ b/libavfilter/x86/vf_pullup_init.c
@@ -22,19 +22,21 @@
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/vf_pullup.h"
 
-int ff_pullup_filter_diff_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s);
-int ff_pullup_filter_comb_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s);
-int ff_pullup_filter_var_mmx (const uint8_t *a, const uint8_t *b, ptrdiff_t s);
+int ff_pullup_filter_diff_sse2 (const uint8_t *a, const uint8_t *b, ptrdiff_t s);
+int ff_pullup_filter_comb_ssse3(const uint8_t *a, const uint8_t *b, ptrdiff_t s);
+int ff_pullup_filter_var_sse2  (const uint8_t *a, const uint8_t *b, ptrdiff_t s);
 
 av_cold void ff_pullup_init_x86(PullupContext *s)
 {
 #if HAVE_X86ASM
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags)) {
-        s->diff = ff_pullup_filter_diff_mmx;
-        s->comb = ff_pullup_filter_comb_mmx;
-        s->var  = ff_pullup_filter_var_mmx;
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        s->diff = ff_pullup_filter_diff_sse2;
+        s->var  = ff_pullup_filter_var_sse2;
+    }
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        s->comb = ff_pullup_filter_comb_ssse3;
     }
 #endif
 }
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2025-10-12 17:46 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-12 17:46 [FFmpeg-devel] [PATCH] avfilter/x86/vf_{pullup,spp}: Port functions to SSE2, SSSE3 (PR #20696) mkver via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git