* [FFmpeg-devel] [PATCH] avcodec/x86/h263_loopfilter: Port loop filter to SSE2 (PR #20636)
@ 2025-10-01 12:31 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-01 12:31 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #20636 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20636
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20636.patch
From 0c2f259863bea7908422a5ae43ec380fce1f8135 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 1 Oct 2025 13:42:09 +0200
Subject: [PATCH 1/2] tests/checkasm/llviddsp: Use the same width for each cpuflag
Otherwise each cpuflag would be tested with a different random width and the benchmark numbers would be incomparable nonsense.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
tests/checkasm/llviddsp.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/tests/checkasm/llviddsp.c b/tests/checkasm/llviddsp.c
index 9f8de65df4..0552e98106 100644
--- a/tests/checkasm/llviddsp.c
+++ b/tests/checkasm/llviddsp.c
@@ -195,9 +195,13 @@ static void check_add_gradient_pred(LLVidDSPContext *c, int w) {
void checkasm_check_llviddsp(void)
{
LLVidDSPContext c;
- int width = 16 * av_clip(rnd(), 16, 128);
+ static int saved_width = 0;
+ int width = saved_width;
int accRnd = rnd() & 0xFF;
+ if (!width)
+ saved_width = width = 16 * av_clip(rnd(), 16, 128);
+
ff_llviddsp_init(&c);
check_add_bytes(&c, width);
--
2.49.1
From 412776390c42ed6088752a314ec515792239055d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 1 Oct 2025 10:46:39 +0200
Subject: [PATCH 2/2] avcodec/x86/h263_loopfilter: Port loop filter to SSE2
Old benchmarks:
h263dsp.h_loop_filter_c: 41.2 ( 1.00x)
h263dsp.h_loop_filter_mmx: 39.5 ( 1.04x)
h263dsp.v_loop_filter_c: 43.5 ( 1.00x)
h263dsp.v_loop_filter_mmx: 16.9 ( 2.57x)
New benchmarks:
h263dsp.h_loop_filter_c: 41.6 ( 1.00x)
h263dsp.h_loop_filter_sse2: 28.2 ( 1.48x)
h263dsp.v_loop_filter_c: 42.4 ( 1.00x)
h263dsp.v_loop_filter_sse2: 15.1 ( 2.81x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/constants.c | 2 +-
libavcodec/x86/constants.h | 2 +-
libavcodec/x86/h263_loopfilter.asm | 167 ++++++++++++-----------------
libavcodec/x86/h263dsp_init.c | 10 +-
tests/checkasm/h263dsp.c | 2 +-
5 files changed, 78 insertions(+), 105 deletions(-)
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index c5f3c6428e..1e2f5990e4 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -75,7 +75,7 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x808
0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
-DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FC) = { 0xFCFCFCFCFCFCFCFCULL, 0xFCFCFCFCFCFCFCFCULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL };
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index 4a55adb5b3..7d0bd975b9 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -56,8 +56,8 @@ extern const ymm_reg ff_pb_1;
extern const ymm_reg ff_pb_2;
extern const ymm_reg ff_pb_3;
extern const ymm_reg ff_pb_80;
+extern const xmm_reg ff_pb_FC;
extern const ymm_reg ff_pb_FE;
-extern const uint64_t ff_pb_FC;
extern const xmm_reg ff_ps_neg;
diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm
index 77c8cf154d..ebe76f01af 100644
--- a/libavcodec/x86/h263_loopfilter.asm
+++ b/libavcodec/x86/h263_loopfilter.asm
@@ -1,5 +1,5 @@
;******************************************************************************
-;* MMX-optimized H.263 loop filter
+;* SSE2-optimized H.263 loop filter
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
@@ -22,7 +22,6 @@
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
cextern pb_FC
cextern h263_loop_filter_strength
@@ -30,60 +29,45 @@ SECTION .text
%macro H263_LOOP_FILTER 5
pxor m7, m7
- mova m0, [%1]
- mova m1, [%1]
- mova m2, [%4]
- mova m3, [%4]
+ movq m0, [%1]
+ movq m6, [%4]
+ mova m5, m0
punpcklbw m0, m7
- punpckhbw m1, m7
+ punpcklbw m6, m7
+ psubw m0, m6
+ movq m2, [%2]
+ movq m1, [%3]
+ mova m3, m2
+ mova m4, m1
punpcklbw m2, m7
- punpckhbw m3, m7
- psubw m0, m2
- psubw m1, m3
- mova m2, [%2]
- mova m3, [%2]
- mova m4, [%3]
- mova m5, [%3]
- punpcklbw m2, m7
- punpckhbw m3, m7
- punpcklbw m4, m7
- punpckhbw m5, m7
- psubw m4, m2
- psubw m5, m3
- psllw m4, 2
- psllw m5, 2
- paddw m4, m0
- paddw m5, m1
+ punpcklbw m1, m7
+ psubw m1, m2
+ psllw m1, 2
+ paddw m1, m0
pxor m6, m6
- pcmpgtw m6, m4
- pcmpgtw m7, m5
- pxor m4, m6
- pxor m5, m7
- psubw m4, m6
- psubw m5, m7
- psrlw m4, 3
- psrlw m5, 3
- packuswb m4, m5
+ pcmpgtw m6, m1
+ pxor m1, m6
+ psubw m1, m6
+ psrlw m1, 3
+ packuswb m1, m7
packsswb m6, m7
- pxor m7, m7
movd m2, %5
punpcklbw m2, m2
punpcklbw m2, m2
punpcklbw m2, m2
- psubusb m2, m4
- mova m3, m2
- psubusb m3, m4
- psubb m2, m3
- mova m3, [%2]
- mova m4, [%3]
+ psubusb m2, m1
+ mova m7, m2
+ psubusb m7, m1
+ psubb m2, m7
pxor m3, m6
pxor m4, m6
paddusb m3, m2
psubusb m4, m2
+ pxor m7, m7
pxor m3, m6
pxor m4, m6
paddusb m2, m2
- packsswb m0, m1
+ packsswb m0, m7
pcmpgtb m7, m0
pxor m0, m7
psubb m0, m7
@@ -94,22 +78,20 @@ SECTION .text
psrlw m1, 2
pxor m1, m7
psubb m1, m7
- mova m5, [%1]
- mova m6, [%4]
+ movq m6, [%4]
psubb m5, m1
paddb m6, m1
%endmacro
-INIT_MMX mmx
-; void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
-cglobal h263_v_loop_filter, 3,5
+INIT_XMM sse2
+; void ff_h263_v_loop_filter_sse2(uint8_t *src, int stride, int qscale)
+cglobal h263_v_loop_filter, 3,5,8
movsxdifnidn r1, r1d
movsxdifnidn r2, r2d
- lea r4, [h263_loop_filter_strength]
- movzx r3d, BYTE [r4+r2]
- movsx r2, r3b
- shl r2, 1
+ lea r3, [h263_loop_filter_strength]
+ movzx r2d, BYTE [r3+r2]
+ shl r2d, 1
mov r3, r0
sub r3, r1
@@ -117,73 +99,64 @@ cglobal h263_v_loop_filter, 3,5
sub r4, r1
H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
- mova [r3], m3
- mova [r0], m4
- mova [r4], m5
- mova [r0+r1], m6
+ movq [r3], m3
+ movq [r0], m4
+ movq [r4], m5
+ movq [r0+r1], m6
RET
%macro TRANSPOSE4X4 2
- movd m0, [%1]
- movd m1, [%1+r1]
- movd m2, [%1+r1*2]
- movd m3, [%1+r3]
- punpcklbw m0, m1
- punpcklbw m2, m3
- mova m1, m0
- punpcklwd m0, m2
- punpckhwd m1, m2
- movd [%2+ 0], m0
- punpckhdq m0, m0
- movd [%2+ 8], m0
- movd [%2+16], m1
- punpckhdq m1, m1
- movd [%2+24], m1
+ movd %1, [%2]
+ movd m2, [%2+r1]
+ movd m3, [%2+r1*2]
+ movd m4, [%2+r3]
+ punpcklbw %1, m2
+ punpcklbw m3, m4
+ punpcklwd %1, m3
%endmacro
-; void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
-INIT_MMX mmx
-cglobal h263_h_loop_filter, 3,5,0,32
+; void ff_h263_h_loop_filter_sse2(uint8_t *src, int stride, int qscale)
+INIT_XMM sse2
+cglobal h263_h_loop_filter, 3,5,8,32
movsxdifnidn r1, r1d
movsxdifnidn r2, r2d
lea r4, [h263_loop_filter_strength]
- movzx r3d, BYTE [r4+r2]
- movsx r2, r3b
- shl r2, 1
+ movzx r2d, BYTE [r4+r2]
+ shl r2d, 1
sub r0, 2
lea r3, [r1*3]
-
- TRANSPOSE4X4 r0, rsp
lea r4, [r0+r1*4]
- TRANSPOSE4X4 r4, rsp+4
+
+ TRANSPOSE4X4 m0, r0
+ TRANSPOSE4X4 m1, r4
+ mova m2, m0
+ punpckldq m0, m1
+ mova [rsp], m0
+ punpckhdq m2, m1
+ mova [rsp+16], m2
H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
- mova m1, m5
- mova m0, m4
punpcklbw m5, m3
punpcklbw m4, m6
- punpckhbw m1, m3
- punpckhbw m0, m6
- mova m3, m5
- mova m6, m1
+ mova m0, m5
punpcklwd m5, m4
- punpcklwd m1, m0
- punpckhwd m3, m4
- punpckhwd m6, m0
+ punpckhwd m0, m4
movd [r0], m5
+ movd [r4], m0
+ pshufd m1, m5, 0x1
+ pshufd m2, m0, 0x1
+ movd [r0+r1*1], m1
+ movd [r4+r1*1], m2
punpckhdq m5, m5
- movd [r0+r1*1], m5
- movd [r0+r1*2], m3
- punpckhdq m3, m3
- movd [r0+r3], m3
- movd [r4], m1
- punpckhdq m1, m1
- movd [r4+r1*1], m1
- movd [r4+r1*2], m6
- punpckhdq m6, m6
- movd [r4+r3], m6
+ punpckhdq m0, m0
+ movd [r0+r1*2], m5
+ movd [r4+r1*2], m0
+ punpckhdq m5, m5
+ punpckhdq m0, m0
+ movd [r0+r3], m5
+ movd [r4+r3], m0
RET
diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c
index ab81063233..3dd5d132e5 100644
--- a/libavcodec/x86/h263dsp_init.c
+++ b/libavcodec/x86/h263dsp_init.c
@@ -25,15 +25,15 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/h263dsp.h"
-void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
-void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
+void ff_h263_h_loop_filter_sse2(uint8_t *src, int stride, int qscale);
+void ff_h263_v_loop_filter_sse2(uint8_t *src, int stride, int qscale);
av_cold void ff_h263dsp_init_x86(H263DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
- if (EXTERNAL_MMX(cpu_flags)) {
- c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
- c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->h263_h_loop_filter = ff_h263_h_loop_filter_sse2;
+ c->h263_v_loop_filter = ff_h263_v_loop_filter_sse2;
}
}
diff --git a/tests/checkasm/h263dsp.c b/tests/checkasm/h263dsp.c
index 2d0957a90b..f99d376adc 100644
--- a/tests/checkasm/h263dsp.c
+++ b/tests/checkasm/h263dsp.c
@@ -34,7 +34,7 @@ static void check_loop_filter(char dim, filter func)
LOCAL_ALIGNED_16(uint8_t, buf1, [32 * 32]);
int qscale = rnd() % 32;
- declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);
+ declare_func(void, uint8_t *, int, int);
for (size_t y = 0; y < 32; y++)
for (size_t x = 0; x < 32; x++)
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-10-01 12:32 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-01 12:31 [FFmpeg-devel] [PATCH] avcodec/x86/h263_loopfilter: Port loop filter to SSE2 (PR #20636) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git