* [FFmpeg-devel] [PATCH] avfilter/x86/vf_{pullup,spp}: Port functions to SSE2, SSSE3 (PR #20696)
@ 2025-10-12 17:46 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-12 17:46 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #20696 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20696
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20696.patch
From 1b3235d4163e0bf31d017c2df12d3198387f4798 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 12 Oct 2025 17:19:26 +0200
Subject: [PATCH 1/2] avfilter/x86/vf_spp: Port store_slice to SSE2
This allows removing an emms_c() call from the filter. It also gives
a 25% speedup here (when timing the calls to store_slice using
START/STOP_TIMER).
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_spp.c | 2 --
libavfilter/x86/vf_spp.c | 49 ++++++++++++++++++++--------------------
2 files changed, 24 insertions(+), 27 deletions(-)
diff --git a/libavfilter/vf_spp.c b/libavfilter/vf_spp.c
index 5c5b98f8db..20c9fd4340 100644
--- a/libavfilter/vf_spp.c
+++ b/libavfilter/vf_spp.c
@@ -31,7 +31,6 @@
* ported by Clément Bœsch for FFmpeg.
*/
-#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
@@ -425,7 +424,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
filter(s, out->data[1], in->data[1], out->linesize[1], in->linesize[1], cw, ch, qp_table, qp_stride, 0, depth);
filter(s, out->data[2], in->data[2], out->linesize[2], in->linesize[2], cw, ch, qp_table, qp_stride, 0, depth);
}
- emms_c();
}
}
diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c
index f8e5727bfc..48c3d25d7c 100644
--- a/libavfilter/x86/vf_spp.c
+++ b/libavfilter/x86/vf_spp.c
@@ -18,16 +18,20 @@
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
+#include <stdint.h>
+#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
#include "libavfilter/vf_spp.h"
-#if HAVE_MMX_INLINE
-static void store_slice_mmx(uint8_t *dst, const int16_t *src,
- int dst_stride, int src_stride,
- int width, int height, int log2_scale,
- const uint8_t dither[8][8])
+#if HAVE_SSE2_INLINE
+static void store_slice_sse2(uint8_t *dst, const int16_t *src,
+ int dst_stride, int src_stride,
+ int width, int height, int log2_scale,
+ const uint8_t dither[8][8])
{
int y;
@@ -35,30 +39,25 @@ static void store_slice_mmx(uint8_t *dst, const int16_t *src,
uint8_t *dst1 = dst;
const int16_t *src1 = src;
__asm__ volatile(
- "movq (%3), %%mm3 \n"
- "movq (%3), %%mm4 \n"
- "movd %4, %%mm2 \n"
- "pxor %%mm0, %%mm0 \n"
- "punpcklbw %%mm0, %%mm3 \n"
- "punpckhbw %%mm0, %%mm4 \n"
- "psraw %%mm2, %%mm3 \n"
- "psraw %%mm2, %%mm4 \n"
- "movd %5, %%mm2 \n"
+ "movq (%3), %%xmm1 \n"
+ "movd %4, %%xmm2 \n"
+ "pxor %%xmm0, %%xmm0 \n"
+ "punpcklbw %%xmm0, %%xmm1 \n"
+ "psraw %%xmm2, %%xmm1 \n"
+ "movd %5, %%xmm2 \n"
"1: \n"
- "movq (%0), %%mm0 \n"
- "movq 8(%0), %%mm1 \n"
- "paddw %%mm3, %%mm0 \n"
- "paddw %%mm4, %%mm1 \n"
- "psraw %%mm2, %%mm0 \n"
- "psraw %%mm2, %%mm1 \n"
- "packuswb %%mm1, %%mm0 \n"
- "movq %%mm0, (%1) \n"
+ "movdqa (%0), %%xmm0 \n"
+ "paddw %%xmm1, %%xmm0 \n"
+ "psraw %%xmm2, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0, (%1) \n"
"add $16, %0 \n"
"add $8, %1 \n"
"cmp %2, %1 \n"
" jb 1b \n"
: "+r" (src1), "+r"(dst1)
: "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale)
+ XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
);
src += src_stride;
dst += dst_stride;
@@ -69,11 +68,11 @@ static void store_slice_mmx(uint8_t *dst, const int16_t *src,
av_cold void ff_spp_init_x86(SPPContext *s)
{
-#if HAVE_MMX_INLINE
+#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
- if (cpu_flags & AV_CPU_FLAG_MMX) {
- s->store_slice = store_slice_mmx;
+ if (INLINE_SSE2(cpu_flags)) { /* only override when the SSE2 inline-asm check passes for these CPU flags */
+ s->store_slice = store_slice_sse2; /* replaces whatever store_slice the caller installed before */
}
#endif
}
--
2.49.1
From d9571bb9b2e49042a7d2fb0bd18c390b6dc63f57 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 12 Oct 2025 19:28:35 +0200
Subject: [PATCH 2/2] avfilter/x86/vf_pullup: Port pullup functions to SSE2,
SSSE3
The diff and var functions benefit from psadbw; comb benefits from wider
registers, which make it possible to avoid reloading values, reducing the
number of loads from 48 to 10. Performance increased by 117% (the loop
in compute_metric() has been timed); codesize decreased by 144B.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_pullup.c | 2 -
libavfilter/x86/vf_pullup.asm | 182 ++++++++++---------------------
libavfilter/x86/vf_pullup_init.c | 16 +--
3 files changed, 66 insertions(+), 134 deletions(-)
diff --git a/libavfilter/vf_pullup.c b/libavfilter/vf_pullup.c
index 1e4289aab1..d963840fe9 100644
--- a/libavfilter/vf_pullup.c
+++ b/libavfilter/vf_pullup.c
@@ -19,7 +19,6 @@
*/
#include "libavutil/avassert.h"
-#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
#include "libavutil/mem.h"
#include "libavutil/opt.h"
@@ -597,7 +596,6 @@ static void pullup_submit_field(PullupContext *s, PullupBuffer *b, int parity)
compute_metric(s, f->diffs, f, parity, f->prev->prev, parity, s->diff);
compute_metric(s, f->combs, parity ? f->prev : f, 0, parity ? f : f->prev, 1, s->comb);
compute_metric(s, f->vars, f, parity, f, -1, s->var);
- emms_c();
/* Advance the circular list */
if (!s->first)
diff --git a/libavfilter/x86/vf_pullup.asm b/libavfilter/x86/vf_pullup.asm
index 26c2a27d37..6875a846a2 100644
--- a/libavfilter/x86/vf_pullup.asm
+++ b/libavfilter/x86/vf_pullup.asm
@@ -22,157 +22,89 @@
SECTION .text
-INIT_MMX mmx
-cglobal pullup_filter_diff, 3, 5, 8, first, second, size
+INIT_XMM sse2
+cglobal pullup_filter_diff, 3, 4, 3, first, second, size
mov r3, 4
- pxor m4, m4
- pxor m7, m7
+ pxor m2, m2 ; m2 = running sum of absolute differences over the 4 rows
.loop:
movq m0, [firstq]
- movq m2, [firstq]
add firstq, sizeq
movq m1, [secondq]
add secondq, sizeq
- psubusb m2, m1
- psubusb m1, m0
- movq m0, m2
- movq m3, m1
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpckhbw m2, m7
- punpckhbw m3, m7
- paddw m4, m0
- paddw m4, m1
- paddw m4, m2
- paddw m4, m3
+ psadbw m0, m1 ; sum of |first - second| over the 8 loaded bytes (upper halves are zero after movq)
+ paddw m2, m0 ; accumulate this row's sum
dec r3
jnz .loop
- movq m3, m4
- punpcklwd m4, m7
- punpckhwd m3, m7
- paddd m3, m4
- movd eax, m3
- psrlq m3, 32
- movd r4d, m3
- add eax, r4d
+ movd eax, m2 ; the total lives in the low word of m2; return it
RET
-INIT_MMX mmx
-cglobal pullup_filter_comb, 3, 5, 8, first, second, size
- mov r3, 4
+INIT_XMM ssse3
+cglobal pullup_filter_comb, 3, 4, 7, first, second, size ; r3 is the only GPR used besides the three arguments
+ movq m0, [firstq] ; m0 = first[0] (8 bytes)
+ sub secondq, sizeq
+ movq m1, [secondq] ; m1 = second[-1]
pxor m6, m6
- pxor m7, m7
- sub secondq, sizeq
+ punpcklbw m0, m6 ; zero-extend bytes to words (m6 = 0)
+ punpcklbw m1, m6
+ add firstq, sizeq ; firstq -> first[1]
+ add secondq, sizeq ; secondq -> second[0]
+ pxor m5, m5 ; m5 = eight 16-bit partial sums
+ mov r3, 4
.loop:
- movq m0, [firstq]
- movq m1, [secondq]
- punpcklbw m0, m7
- movq m2, [secondq+sizeq]
- punpcklbw m1, m7
- punpcklbw m2, m7
- paddw m0, m0
- paddw m1, m2
- movq m2, m0
- psubusw m0, m1
- psubusw m1, m2
- paddw m6, m0
- paddw m6, m1
+ movq m2, [firstq] ; m2 = first[i+1]
+ movq m3, [secondq] ; m3 = second[i]
+ add firstq, sizeq
+ add secondq, sizeq
+ punpcklbw m2, m6
+ punpcklbw m3, m6
+ mova m4, m0 ; keep first[i] for the second term below
- movq m0, [firstq]
- movq m1, [secondq]
- punpckhbw m0, m7
- movq m2, [secondq+sizeq]
- punpckhbw m1, m7
- punpckhbw m2, m7
paddw m0, m0
- paddw m1, m2
- movq m2, m0
- psubusw m0, m1
- psubusw m1, m2
- paddw m6, m0
- paddw m6, m1
+ paddw m1, m3
+ psubw m0, m1
+ pabsw m0, m0 ; |2*first[i] - (second[i-1] + second[i])|
+ paddw m5, m0
- movq m0, [secondq+sizeq]
- movq m1, [firstq]
- punpcklbw m0, m7
- movq m2, [firstq+sizeq]
- punpcklbw m1, m7
- punpcklbw m2, m7
- paddw m0, m0
- paddw m1, m2
- movq m2, m0
- psubusw m0, m1
- psubusw m1, m2
- paddw m6, m0
- paddw m6, m1
+ mova m1, m3 ; second[i] becomes the previous second row for the next iteration
+ paddw m4, m2
+ paddw m3, m3
+ psubw m3, m4
+ pabsw m3, m3 ; |2*second[i] - (first[i] + first[i+1])|
+ paddw m5, m3
+ mova m0, m2 ; first[i+1] becomes the current first row (operands were reversed, leaving the stale abs value in m0)
- movq m0, [secondq+sizeq]
- movq m1, [firstq]
- punpckhbw m0, m7
- movq m2, [firstq+sizeq]
- punpckhbw m1, m7
- punpckhbw m2, m7
- paddw m0, m0
- paddw m1, m2
- movq m2, m0
- psubusw m0, m1
- psubusw m1, m2
- paddw m6, m0
- paddw m6, m1
-
- add firstq, sizeq
- add secondq, sizeq
dec r3
jnz .loop
- movq m5, m6
- punpcklwd m6, m7
- punpckhwd m5, m7
- paddd m5, m6
- movd eax, m5
- psrlq m5, 32
- movd r4d, m5
- add eax, r4d
+ mova m0, m5 ; full 128-bit copy; movq xmm,xmm would zero words 4-7 and drop half of the sums
+ punpcklwd m5, m6 ; words 0-3 -> dwords
+ punpckhwd m0, m6 ; words 4-7 -> dwords
+ paddd m0, m5
+ pshufd m5, m0, 0xE ; fold the upper two dwords onto the lower two
+ paddd m0, m5
+ pshufd m5, m0, 0x1 ; fold dword 1 onto dword 0
+ paddd m0, m5
+ movd eax, m0
RET
-INIT_MMX mmx
-cglobal pullup_filter_var, 3, 5, 8, first, second, size
- mov r3, 3
- pxor m4, m4
- pxor m7, m7
-
-.loop:
+INIT_XMM sse2
+cglobal pullup_filter_var, 3, 3, 3, first, second, size
movq m0, [firstq]
- movq m2, [firstq]
- movq m1, [firstq+sizeq]
add firstq, sizeq
- psubusb m2, m1
- psubusb m1, m0
- movq m0, m2
- movq m3, m1
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpckhbw m2, m7
- punpckhbw m3, m7
- paddw m4, m0
- paddw m4, m1
- paddw m4, m2
- paddw m4, m3
-
- dec r3
- jnz .loop
-
- movq m3, m4
- punpcklwd m4, m7
- punpckhwd m3, m7
- paddd m3, m4
- movd eax, m3
- psrlq m3, 32
- movd r4d, m3
- add eax, r4d
+ movq m1, [firstq] ; row 1 (firstq was advanced by one row just above)
+ pxor m2, m2 ; m2 = running sum
+ psadbw m0, m1 ; SAD of rows 0 and 1
+ paddw m2, m0
+ movq m0, [firstq+sizeq] ; row 2
+ psadbw m1, m0 ; SAD of rows 1 and 2
+ paddw m2, m1
+ movq m1, [firstq+2*sizeq] ; row 3
+ psadbw m0, m1 ; SAD of rows 2 and 3
+ paddw m2, m0
+ movd eax, m2 ; the following shl scales the sum of the three SADs by 4
shl eax, 2
RET
diff --git a/libavfilter/x86/vf_pullup_init.c b/libavfilter/x86/vf_pullup_init.c
index 562a3fb625..943c1de9d7 100644
--- a/libavfilter/x86/vf_pullup_init.c
+++ b/libavfilter/x86/vf_pullup_init.c
@@ -22,19 +22,21 @@
#include "libavutil/x86/cpu.h"
#include "libavfilter/vf_pullup.h"
-int ff_pullup_filter_diff_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s);
-int ff_pullup_filter_comb_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s);
-int ff_pullup_filter_var_mmx (const uint8_t *a, const uint8_t *b, ptrdiff_t s);
+int ff_pullup_filter_diff_sse2 (const uint8_t *a, const uint8_t *b, ptrdiff_t s);
+int ff_pullup_filter_comb_ssse3(const uint8_t *a, const uint8_t *b, ptrdiff_t s);
+int ff_pullup_filter_var_sse2 (const uint8_t *a, const uint8_t *b, ptrdiff_t s);
av_cold void ff_pullup_init_x86(PullupContext *s)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
- if (EXTERNAL_MMX(cpu_flags)) {
- s->diff = ff_pullup_filter_diff_mmx;
- s->comb = ff_pullup_filter_comb_mmx;
- s->var = ff_pullup_filter_var_mmx;
+ if (EXTERNAL_SSE2(cpu_flags)) { /* diff and var are provided as SSE2 versions */
+ s->diff = ff_pullup_filter_diff_sse2;
+ s->var = ff_pullup_filter_var_sse2;
+ }
+ if (EXTERNAL_SSSE3(cpu_flags)) { /* comb is only overridden when SSSE3 is usable; otherwise s->comb is left untouched */
+ s->comb = ff_pullup_filter_comb_ssse3;
}
#endif
}
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-10-12 17:46 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-12 17:46 [FFmpeg-devel] [PATCH] avfilter/x86/vf_{pullup,spp}: Port functions to SSE2, SSSE3 (PR #20696) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git