From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: mkver <code@ffmpeg.org> Subject: [FFmpeg-devel] [PATCH] avfilter/x86/vf_{pullup,spp}: Port functions to SSE2, SSSE3 (PR #20696) Date: Sun, 12 Oct 2025 17:46:06 -0000 Message-ID: <176029116721.52.15970957343339096878@bf249f23a2c8> (raw) PR #20696 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20696 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20696.patch From 1b3235d4163e0bf31d017c2df12d3198387f4798 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 12 Oct 2025 17:19:26 +0200 Subject: [PATCH 1/2] avfilter/x86/vf_spp: Port store_slice to SSE2 This allows to remove an emms_c from the filter. It also gives 25% speedup here (when timing the calls to store_slice using START/STOP_TIMER). Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavfilter/vf_spp.c | 2 -- libavfilter/x86/vf_spp.c | 49 ++++++++++++++++++++-------------------- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/libavfilter/vf_spp.c b/libavfilter/vf_spp.c index 5c5b98f8db..20c9fd4340 100644 --- a/libavfilter/vf_spp.c +++ b/libavfilter/vf_spp.c @@ -31,7 +31,6 @@ * ported by Clément Bœsch for FFmpeg. */ -#include "libavutil/emms.h" #include "libavutil/imgutils.h" #include "libavutil/mem.h" #include "libavutil/mem_internal.h" @@ -425,7 +424,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) filter(s, out->data[1], in->data[1], out->linesize[1], in->linesize[1], cw, ch, qp_table, qp_stride, 0, depth); filter(s, out->data[2], in->data[2], out->linesize[2], in->linesize[2], cw, ch, qp_table, qp_stride, 0, depth); } - emms_c(); } } diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c index f8e5727bfc..48c3d25d7c 100644 --- a/libavfilter/x86/vf_spp.c +++ b/libavfilter/x86/vf_spp.c @@ -18,16 +18,20 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ +#include <stdint.h> +#include "config.h" #include "libavutil/attributes.h" #include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" #include "libavfilter/vf_spp.h" -#if HAVE_MMX_INLINE -static void store_slice_mmx(uint8_t *dst, const int16_t *src, - int dst_stride, int src_stride, - int width, int height, int log2_scale, - const uint8_t dither[8][8]) +#if HAVE_SSE2_INLINE +static void store_slice_sse2(uint8_t *dst, const int16_t *src, + int dst_stride, int src_stride, + int width, int height, int log2_scale, + const uint8_t dither[8][8]) { int y; @@ -35,30 +39,25 @@ static void store_slice_mmx(uint8_t *dst, const int16_t *src, uint8_t *dst1 = dst; const int16_t *src1 = src; __asm__ volatile( - "movq (%3), %%mm3 \n" - "movq (%3), %%mm4 \n" - "movd %4, %%mm2 \n" - "pxor %%mm0, %%mm0 \n" - "punpcklbw %%mm0, %%mm3 \n" - "punpckhbw %%mm0, %%mm4 \n" - "psraw %%mm2, %%mm3 \n" - "psraw %%mm2, %%mm4 \n" - "movd %5, %%mm2 \n" + "movq (%3), %%xmm1 \n" + "movd %4, %%xmm2 \n" + "pxor %%xmm0, %%xmm0 \n" + "punpcklbw %%xmm0, %%xmm1 \n" + "psraw %%xmm2, %%xmm1 \n" + "movd %5, %%xmm2 \n" "1: \n" - "movq (%0), %%mm0 \n" - "movq 8(%0), %%mm1 \n" - "paddw %%mm3, %%mm0 \n" - "paddw %%mm4, %%mm1 \n" - "psraw %%mm2, %%mm0 \n" - "psraw %%mm2, %%mm1 \n" - "packuswb %%mm1, %%mm0 \n" - "movq %%mm0, (%1) \n" + "movdqa (%0), %%xmm0 \n" + "paddw %%xmm1, %%xmm0 \n" + "psraw %%xmm2, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0, (%1) \n" "add $16, %0 \n" "add $8, %1 \n" "cmp %2, %1 \n" " jb 1b \n" : "+r" (src1), "+r"(dst1) : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale) + XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2") ); src += src_stride; dst += dst_stride; @@ -69,11 +68,11 @@ static void store_slice_mmx(uint8_t *dst, const int16_t *src, av_cold void ff_spp_init_x86(SPPContext *s) { -#if HAVE_MMX_INLINE +#if HAVE_SSE2_INLINE int cpu_flags = av_get_cpu_flags(); - if (cpu_flags & AV_CPU_FLAG_MMX) { - s->store_slice = 
store_slice_mmx; + if (INLINE_SSE2(cpu_flags)) { + s->store_slice = store_slice_sse2; } #endif } -- 2.49.1 From d9571bb9b2e49042a7d2fb0bd18c390b6dc63f57 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 12 Oct 2025 19:28:35 +0200 Subject: [PATCH 2/2] avfilter/x86/vf_pullup: Port pullup functions to SSE2, SSSE3 The diff and var functions benefit from psadbw, comb from wider registers which allows to avoid reloading values, reducing the number of loads from 48 to 10. Performance increased by 117% (the loop in compute_metric() has been timed); codesize decreased by 144B. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavfilter/vf_pullup.c | 2 - libavfilter/x86/vf_pullup.asm | 182 ++++++++++--------------------- libavfilter/x86/vf_pullup_init.c | 16 +-- 3 files changed, 66 insertions(+), 134 deletions(-) diff --git a/libavfilter/vf_pullup.c b/libavfilter/vf_pullup.c index 1e4289aab1..d963840fe9 100644 --- a/libavfilter/vf_pullup.c +++ b/libavfilter/vf_pullup.c @@ -19,7 +19,6 @@ */ #include "libavutil/avassert.h" -#include "libavutil/emms.h" #include "libavutil/imgutils.h" #include "libavutil/mem.h" #include "libavutil/opt.h" @@ -597,7 +596,6 @@ static void pullup_submit_field(PullupContext *s, PullupBuffer *b, int parity) compute_metric(s, f->diffs, f, parity, f->prev->prev, parity, s->diff); compute_metric(s, f->combs, parity ? f->prev : f, 0, parity ? 
f : f->prev, 1, s->comb); compute_metric(s, f->vars, f, parity, f, -1, s->var); - emms_c(); /* Advance the circular list */ if (!s->first) diff --git a/libavfilter/x86/vf_pullup.asm b/libavfilter/x86/vf_pullup.asm index 26c2a27d37..6875a846a2 100644 --- a/libavfilter/x86/vf_pullup.asm +++ b/libavfilter/x86/vf_pullup.asm @@ -22,157 +22,89 @@ SECTION .text -INIT_MMX mmx -cglobal pullup_filter_diff, 3, 5, 8, first, second, size +INIT_XMM sse2 +cglobal pullup_filter_diff, 3, 4, 3, first, second, size mov r3, 4 - pxor m4, m4 - pxor m7, m7 + pxor m2, m2 .loop: movq m0, [firstq] - movq m2, [firstq] add firstq, sizeq movq m1, [secondq] add secondq, sizeq - psubusb m2, m1 - psubusb m1, m0 - movq m0, m2 - movq m3, m1 - punpcklbw m0, m7 - punpcklbw m1, m7 - punpckhbw m2, m7 - punpckhbw m3, m7 - paddw m4, m0 - paddw m4, m1 - paddw m4, m2 - paddw m4, m3 + psadbw m0, m1 + paddw m2, m0 dec r3 jnz .loop - movq m3, m4 - punpcklwd m4, m7 - punpckhwd m3, m7 - paddd m3, m4 - movd eax, m3 - psrlq m3, 32 - movd r4d, m3 - add eax, r4d + movd eax, m2 RET -INIT_MMX mmx -cglobal pullup_filter_comb, 3, 5, 8, first, second, size - mov r3, 4 +INIT_XMM ssse3 +cglobal pullup_filter_comb, 3, 5, 7, first, second, size + movq m0, [firstq] + sub secondq, sizeq + movq m1, [secondq] pxor m6, m6 - pxor m7, m7 - sub secondq, sizeq + punpcklbw m0, m6 + punpcklbw m1, m6 + add firstq, sizeq + add secondq, sizeq + pxor m5, m5 + mov r3, 4 .loop: - movq m0, [firstq] - movq m1, [secondq] - punpcklbw m0, m7 - movq m2, [secondq+sizeq] - punpcklbw m1, m7 - punpcklbw m2, m7 - paddw m0, m0 - paddw m1, m2 - movq m2, m0 - psubusw m0, m1 - psubusw m1, m2 - paddw m6, m0 - paddw m6, m1 + movq m2, [firstq] + movq m3, [secondq] + add firstq, sizeq + add secondq, sizeq + punpcklbw m2, m6 + punpcklbw m3, m6 + mova m4, m0 - movq m0, [firstq] - movq m1, [secondq] - punpckhbw m0, m7 - movq m2, [secondq+sizeq] - punpckhbw m1, m7 - punpckhbw m2, m7 paddw m0, m0 - paddw m1, m2 - movq m2, m0 - psubusw m0, m1 - psubusw m1, m2 - 
paddw m6, m0 - paddw m6, m1 + paddw m1, m3 + psubw m0, m1 + pabsw m0, m0 + paddw m5, m0 - movq m0, [secondq+sizeq] - movq m1, [firstq] - punpcklbw m0, m7 - movq m2, [firstq+sizeq] - punpcklbw m1, m7 - punpcklbw m2, m7 - paddw m0, m0 - paddw m1, m2 - movq m2, m0 - psubusw m0, m1 - psubusw m1, m2 - paddw m6, m0 - paddw m6, m1 + mova m1, m3 + paddw m4, m2 + paddw m3, m3 + psubw m3, m4 + pabsw m3, m3 + paddw m5, m3 + mova m2, m0 - movq m0, [secondq+sizeq] - movq m1, [firstq] - punpckhbw m0, m7 - movq m2, [firstq+sizeq] - punpckhbw m1, m7 - punpckhbw m2, m7 - paddw m0, m0 - paddw m1, m2 - movq m2, m0 - psubusw m0, m1 - psubusw m1, m2 - paddw m6, m0 - paddw m6, m1 - - add firstq, sizeq - add secondq, sizeq dec r3 jnz .loop - movq m5, m6 - punpcklwd m6, m7 - punpckhwd m5, m7 - paddd m5, m6 - movd eax, m5 - psrlq m5, 32 - movd r4d, m5 - add eax, r4d + movq m0, m5 + punpcklwd m5, m6 + punpckhwd m0, m6 + paddd m0, m5 + pshufd m5, m0, 0xE + paddd m0, m5 + pshufd m5, m0, 0x1 + paddd m0, m5 + movd eax, m0 RET -INIT_MMX mmx -cglobal pullup_filter_var, 3, 5, 8, first, second, size - mov r3, 3 - pxor m4, m4 - pxor m7, m7 - -.loop: +INIT_XMM sse2 +cglobal pullup_filter_var, 3, 3, 3, first, second, size movq m0, [firstq] - movq m2, [firstq] - movq m1, [firstq+sizeq] add firstq, sizeq - psubusb m2, m1 - psubusb m1, m0 - movq m0, m2 - movq m3, m1 - punpcklbw m0, m7 - punpcklbw m1, m7 - punpckhbw m2, m7 - punpckhbw m3, m7 - paddw m4, m0 - paddw m4, m1 - paddw m4, m2 - paddw m4, m3 - - dec r3 - jnz .loop - - movq m3, m4 - punpcklwd m4, m7 - punpckhwd m3, m7 - paddd m3, m4 - movd eax, m3 - psrlq m3, 32 - movd r4d, m3 - add eax, r4d + movq m1, [firstq] + pxor m2, m2 + psadbw m0, m1 + paddw m2, m0 + movq m0, [firstq+sizeq] + psadbw m1, m0 + paddw m2, m1 + movq m1, [firstq+2*sizeq] + psadbw m0, m1 + paddw m2, m0 + movd eax, m2 shl eax, 2 RET diff --git a/libavfilter/x86/vf_pullup_init.c b/libavfilter/x86/vf_pullup_init.c index 562a3fb625..943c1de9d7 100644 --- 
a/libavfilter/x86/vf_pullup_init.c +++ b/libavfilter/x86/vf_pullup_init.c @@ -22,19 +22,21 @@ #include "libavutil/x86/cpu.h" #include "libavfilter/vf_pullup.h" -int ff_pullup_filter_diff_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s); -int ff_pullup_filter_comb_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s); -int ff_pullup_filter_var_mmx (const uint8_t *a, const uint8_t *b, ptrdiff_t s); +int ff_pullup_filter_diff_sse2 (const uint8_t *a, const uint8_t *b, ptrdiff_t s); +int ff_pullup_filter_comb_ssse3(const uint8_t *a, const uint8_t *b, ptrdiff_t s); +int ff_pullup_filter_var_sse2 (const uint8_t *a, const uint8_t *b, ptrdiff_t s); av_cold void ff_pullup_init_x86(PullupContext *s) { #if HAVE_X86ASM int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(cpu_flags)) { - s->diff = ff_pullup_filter_diff_mmx; - s->comb = ff_pullup_filter_comb_mmx; - s->var = ff_pullup_filter_var_mmx; + if (EXTERNAL_SSE2(cpu_flags)) { + s->diff = ff_pullup_filter_diff_sse2; + s->var = ff_pullup_filter_var_sse2; + } + if (EXTERNAL_SSSE3(cpu_flags)) { + s->comb = ff_pullup_filter_comb_ssse3; } #endif } -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-10-12 17:46 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=176029116721.52.15970957343339096878@bf249f23a2c8 \
    --to=ffmpeg-devel@ffmpeg.org \
    --cc=code@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git