From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: mkver <code@ffmpeg.org> Subject: [FFmpeg-devel] [PATCH] avcodec/x86/h263_loopfilter: Port loop filter to SSE2 (PR #20636) Date: Wed, 01 Oct 2025 12:31:28 -0000 Message-ID: <175932188871.69.12902028291436852776@bf249f23a2c8> (raw) PR #20636 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20636 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20636.patch >From 0c2f259863bea7908422a5ae43ec380fce1f8135 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Wed, 1 Oct 2025 13:42:09 +0200 Subject: [PATCH 1/2] tests/checkasm/llviddsp: Use the same width for each cpuflag Otherwise the benchmark numbers would be incomparable nonsense. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- tests/checkasm/llviddsp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/checkasm/llviddsp.c b/tests/checkasm/llviddsp.c index 9f8de65df4..0552e98106 100644 --- a/tests/checkasm/llviddsp.c +++ b/tests/checkasm/llviddsp.c @@ -195,9 +195,13 @@ static void check_add_gradient_pred(LLVidDSPContext *c, int w) { void checkasm_check_llviddsp(void) { LLVidDSPContext c; - int width = 16 * av_clip(rnd(), 16, 128); + static int saved_width = 0; + int width = saved_width; int accRnd = rnd() & 0xFF; + if (!width) + saved_width = width = 16 * av_clip(rnd(), 16, 128); + ff_llviddsp_init(&c); check_add_bytes(&c, width); -- 2.49.1 >From 412776390c42ed6088752a314ec515792239055d Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Wed, 1 Oct 2025 10:46:39 +0200 Subject: [PATCH 2/2] avcodec/x86/h263_loopfilter: Port loop filter to SSE2 Old benchmarks: h263dsp.h_loop_filter_c: 41.2 ( 1.00x) h263dsp.h_loop_filter_mmx: 39.5 ( 1.04x) h263dsp.v_loop_filter_c: 43.5 ( 1.00x) h263dsp.v_loop_filter_mmx: 16.9 ( 2.57x) New benchmarks: h263dsp.h_loop_filter_c: 41.6 ( 1.00x) 
h263dsp.h_loop_filter_sse2: 28.2 ( 1.48x) h263dsp.v_loop_filter_c: 42.4 ( 1.00x) h263dsp.v_loop_filter_sse2: 15.1 ( 2.81x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/constants.c | 2 +- libavcodec/x86/constants.h | 2 +- libavcodec/x86/h263_loopfilter.asm | 167 ++++++++++++----------------- libavcodec/x86/h263dsp_init.c | 10 +- tests/checkasm/h263dsp.c | 2 +- 5 files changed, 78 insertions(+), 105 deletions(-) diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index c5f3c6428e..1e2f5990e4 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -75,7 +75,7 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x808 0x8080808080808080ULL, 0x8080808080808080ULL }; DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL }; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FC) = { 0xFCFCFCFCFCFCFCFCULL, 0xFCFCFCFCFCFCFCFCULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL }; diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h index 4a55adb5b3..7d0bd975b9 100644 --- a/libavcodec/x86/constants.h +++ b/libavcodec/x86/constants.h @@ -56,8 +56,8 @@ extern const ymm_reg ff_pb_1; extern const ymm_reg ff_pb_2; extern const ymm_reg ff_pb_3; extern const ymm_reg ff_pb_80; +extern const xmm_reg ff_pb_FC; extern const ymm_reg ff_pb_FE; -extern const uint64_t ff_pb_FC; extern const xmm_reg ff_ps_neg; diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm index 77c8cf154d..ebe76f01af 100644 --- a/libavcodec/x86/h263_loopfilter.asm +++ b/libavcodec/x86/h263_loopfilter.asm @@ -1,5 +1,5 @@ ;****************************************************************************** -;* MMX-optimized H.263 loop filter +;* SSE2-optimized H.263 loop filter ;* 
Copyright (c) 2003-2013 Michael Niedermayer ;* Copyright (c) 2013 Daniel Kang ;* @@ -22,7 +22,6 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA cextern pb_FC cextern h263_loop_filter_strength @@ -30,60 +29,45 @@ SECTION .text %macro H263_LOOP_FILTER 5 pxor m7, m7 - mova m0, [%1] - mova m1, [%1] - mova m2, [%4] - mova m3, [%4] + movq m0, [%1] + movq m6, [%4] + mova m5, m0 punpcklbw m0, m7 - punpckhbw m1, m7 + punpcklbw m6, m7 + psubw m0, m6 + movq m2, [%2] + movq m1, [%3] + mova m3, m2 + mova m4, m1 punpcklbw m2, m7 - punpckhbw m3, m7 - psubw m0, m2 - psubw m1, m3 - mova m2, [%2] - mova m3, [%2] - mova m4, [%3] - mova m5, [%3] - punpcklbw m2, m7 - punpckhbw m3, m7 - punpcklbw m4, m7 - punpckhbw m5, m7 - psubw m4, m2 - psubw m5, m3 - psllw m4, 2 - psllw m5, 2 - paddw m4, m0 - paddw m5, m1 + punpcklbw m1, m7 + psubw m1, m2 + psllw m1, 2 + paddw m1, m0 pxor m6, m6 - pcmpgtw m6, m4 - pcmpgtw m7, m5 - pxor m4, m6 - pxor m5, m7 - psubw m4, m6 - psubw m5, m7 - psrlw m4, 3 - psrlw m5, 3 - packuswb m4, m5 + pcmpgtw m6, m1 + pxor m1, m6 + psubw m1, m6 + psrlw m1, 3 + packuswb m1, m7 packsswb m6, m7 - pxor m7, m7 movd m2, %5 punpcklbw m2, m2 punpcklbw m2, m2 punpcklbw m2, m2 - psubusb m2, m4 - mova m3, m2 - psubusb m3, m4 - psubb m2, m3 - mova m3, [%2] - mova m4, [%3] + psubusb m2, m1 + mova m7, m2 + psubusb m7, m1 + psubb m2, m7 pxor m3, m6 pxor m4, m6 paddusb m3, m2 psubusb m4, m2 + pxor m7, m7 pxor m3, m6 pxor m4, m6 paddusb m2, m2 - packsswb m0, m1 + packsswb m0, m7 pcmpgtb m7, m0 pxor m0, m7 psubb m0, m7 @@ -94,22 +78,20 @@ SECTION .text psrlw m1, 2 pxor m1, m7 psubb m1, m7 - mova m5, [%1] - mova m6, [%4] + movq m6, [%4] psubb m5, m1 paddb m6, m1 %endmacro -INIT_MMX mmx -; void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale) -cglobal h263_v_loop_filter, 3,5 +INIT_XMM sse2 +; void ff_h263_v_loop_filter_sse2(uint8_t *src, int stride, int qscale) +cglobal h263_v_loop_filter, 3,5,8 movsxdifnidn r1, r1d movsxdifnidn r2, r2d - lea r4, 
[h263_loop_filter_strength] - movzx r3d, BYTE [r4+r2] - movsx r2, r3b - shl r2, 1 + lea r3, [h263_loop_filter_strength] + movzx r2d, BYTE [r3+r2] + shl r2d, 1 mov r3, r0 sub r3, r1 @@ -117,73 +99,64 @@ cglobal h263_v_loop_filter, 3,5 sub r4, r1 H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d - mova [r3], m3 - mova [r0], m4 - mova [r4], m5 - mova [r0+r1], m6 + movq [r3], m3 + movq [r0], m4 + movq [r4], m5 + movq [r0+r1], m6 RET %macro TRANSPOSE4X4 2 - movd m0, [%1] - movd m1, [%1+r1] - movd m2, [%1+r1*2] - movd m3, [%1+r3] - punpcklbw m0, m1 - punpcklbw m2, m3 - mova m1, m0 - punpcklwd m0, m2 - punpckhwd m1, m2 - movd [%2+ 0], m0 - punpckhdq m0, m0 - movd [%2+ 8], m0 - movd [%2+16], m1 - punpckhdq m1, m1 - movd [%2+24], m1 + movd %1, [%2] + movd m2, [%2+r1] + movd m3, [%2+r1*2] + movd m4, [%2+r3] + punpcklbw %1, m2 + punpcklbw m3, m4 + punpcklwd %1, m3 %endmacro -; void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale) -INIT_MMX mmx -cglobal h263_h_loop_filter, 3,5,0,32 +; void ff_h263_h_loop_filter_sse2(uint8_t *src, int stride, int qscale) +INIT_XMM sse2 +cglobal h263_h_loop_filter, 3,5,8,32 movsxdifnidn r1, r1d movsxdifnidn r2, r2d lea r4, [h263_loop_filter_strength] - movzx r3d, BYTE [r4+r2] - movsx r2, r3b - shl r2, 1 + movzx r2d, BYTE [r4+r2] + shl r2d, 1 sub r0, 2 lea r3, [r1*3] - - TRANSPOSE4X4 r0, rsp lea r4, [r0+r1*4] - TRANSPOSE4X4 r4, rsp+4 + + TRANSPOSE4X4 m0, r0 + TRANSPOSE4X4 m1, r4 + mova m2, m0 + punpckldq m0, m1 + mova [rsp], m0 + punpckhdq m2, m1 + mova [rsp+16], m2 H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d - mova m1, m5 - mova m0, m4 punpcklbw m5, m3 punpcklbw m4, m6 - punpckhbw m1, m3 - punpckhbw m0, m6 - mova m3, m5 - mova m6, m1 + mova m0, m5 punpcklwd m5, m4 - punpcklwd m1, m0 - punpckhwd m3, m4 - punpckhwd m6, m0 + punpckhwd m0, m4 movd [r0], m5 + movd [r4], m0 + pshufd m1, m5, 0x1 + pshufd m2, m0, 0x1 + movd [r0+r1*1], m1 + movd [r4+r1*1], m2 punpckhdq m5, m5 - movd [r0+r1*1], m5 - movd [r0+r1*2], m3 - punpckhdq m3, m3 - movd 
[r0+r3], m3 - movd [r4], m1 - punpckhdq m1, m1 - movd [r4+r1*1], m1 - movd [r4+r1*2], m6 - punpckhdq m6, m6 - movd [r4+r3], m6 + punpckhdq m0, m0 + movd [r0+r1*2], m5 + movd [r4+r1*2], m0 + punpckhdq m5, m5 + punpckhdq m0, m0 + movd [r0+r3], m5 + movd [r4+r3], m0 RET diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c index ab81063233..3dd5d132e5 100644 --- a/libavcodec/x86/h263dsp_init.c +++ b/libavcodec/x86/h263dsp_init.c @@ -25,15 +25,15 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/h263dsp.h" -void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale); -void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale); +void ff_h263_h_loop_filter_sse2(uint8_t *src, int stride, int qscale); +void ff_h263_v_loop_filter_sse2(uint8_t *src, int stride, int qscale); av_cold void ff_h263dsp_init_x86(H263DSPContext *c) { int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(cpu_flags)) { - c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx; - c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx; + if (EXTERNAL_SSE2(cpu_flags)) { + c->h263_h_loop_filter = ff_h263_h_loop_filter_sse2; + c->h263_v_loop_filter = ff_h263_v_loop_filter_sse2; } } diff --git a/tests/checkasm/h263dsp.c b/tests/checkasm/h263dsp.c index 2d0957a90b..f99d376adc 100644 --- a/tests/checkasm/h263dsp.c +++ b/tests/checkasm/h263dsp.c @@ -34,7 +34,7 @@ static void check_loop_filter(char dim, filter func) LOCAL_ALIGNED_16(uint8_t, buf1, [32 * 32]); int qscale = rnd() % 32; - declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int); + declare_func(void, uint8_t *, int, int); for (size_t y = 0; y < 32; y++) for (size_t x = 0; x < 32; x++) -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads: [~2025-10-01 12:32 UTC | newest] Thread overview: [no followups] expand [flat | nested] mbox.gz Atom feed
Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=175932188871.69.12902028291436852776@bf249f23a2c8 \
    --to=ffmpeg-devel@ffmpeg.org \
    --cc=code@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git