From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: mkver <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH] avcodec/x86/h264_chromamc: Use xmm regs in chroma_mc4 SSSE3 functions (PR #20842)
Date: Wed, 05 Nov 2025 13:10:49 -0000
Message-ID: <176234825039.25.17628330336355125775@2cb04c0e5124> (raw)
PR #20842 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20842
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20842.patch
Doubling the register size makes it possible to avoid two pmaddubsw
instructions.
The new version is also ABI compliant (the old version used MMX
registers but lacked an emms), and the average versions no longer rely
on padding (the old versions used pavgb with a memory operand that
reads eight bytes, although only four are needed).
Old benchmarks (the last four refer to RV40):
avg_h264_chroma_mc4_8_c: 145.7 ( 1.00x)
avg_h264_chroma_mc4_8_ssse3: 32.3 ( 4.51x)
put_h264_chroma_mc4_8_c: 136.1 ( 1.00x)
put_h264_chroma_mc4_8_ssse3: 29.0 ( 4.70x)
avg_chroma_mc4_c: 162.1 ( 1.00x)
avg_chroma_mc4_ssse3: 31.1 ( 5.22x)
put_chroma_mc4_c: 137.5 ( 1.00x)
put_chroma_mc4_ssse3: 28.6 ( 4.81x)
New benchmarks:
avg_h264_chroma_mc4_8_c: 146.7 ( 1.00x)
avg_h264_chroma_mc4_8_ssse3: 26.5 ( 5.53x)
put_h264_chroma_mc4_8_c: 136.8 ( 1.00x)
put_h264_chroma_mc4_8_ssse3: 22.5 ( 6.09x)
avg_chroma_mc4_c: 165.5 ( 1.00x)
avg_chroma_mc4_ssse3: 27.2 ( 6.08x)
put_chroma_mc4_c: 138.1 ( 1.00x)
put_chroma_mc4_ssse3: 23.2 ( 5.96x)
>From 16296019a93e612ba4d07495e9bc85c49dbc1aaf Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 5 Nov 2025 12:46:50 +0100
Subject: [PATCH] avcodec/x86/h264_chromamc: Use xmm regs in chroma_mc4 SSSE3
functions
Doubling the register size makes it possible to avoid two pmaddubsw
instructions.
The new version is also ABI compliant (the old version used MMX
registers but lacked an emms), and the average versions no longer rely
on padding (the old versions used pavgb with a memory operand that
reads eight bytes, although only four are needed).
Old benchmarks (the last four refer to RV40):
avg_h264_chroma_mc4_8_c: 145.7 ( 1.00x)
avg_h264_chroma_mc4_8_ssse3: 32.3 ( 4.51x)
put_h264_chroma_mc4_8_c: 136.1 ( 1.00x)
put_h264_chroma_mc4_8_ssse3: 29.0 ( 4.70x)
avg_chroma_mc4_c: 162.1 ( 1.00x)
avg_chroma_mc4_ssse3: 31.1 ( 5.22x)
put_chroma_mc4_c: 137.5 ( 1.00x)
put_chroma_mc4_ssse3: 28.6 ( 4.81x)
New benchmarks:
avg_h264_chroma_mc4_8_c: 146.7 ( 1.00x)
avg_h264_chroma_mc4_8_ssse3: 26.5 ( 5.53x)
put_h264_chroma_mc4_8_c: 136.8 ( 1.00x)
put_h264_chroma_mc4_8_ssse3: 22.5 ( 6.09x)
avg_chroma_mc4_c: 165.5 ( 1.00x)
avg_chroma_mc4_ssse3: 27.2 ( 6.08x)
put_chroma_mc4_c: 138.1 ( 1.00x)
put_chroma_mc4_ssse3: 23.2 ( 5.96x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/h264_chromamc.asm | 89 +++++++++++++++++---------------
1 file changed, 46 insertions(+), 43 deletions(-)
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index 6a65d5cabd..7c896db179 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -276,51 +276,57 @@ cglobal %1_%2_chroma_mc8%3, 6, 7+UNIX64, 8
%endmacro
%macro chroma_mc4_ssse3_func 2
-cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 0
- movq m5, [pw_32]
+cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 8
+ mova m5, [pw_32]
..@%1_%2_chroma_mc4_after_init_ %+ cpuname:
- mov r6, r4
+ mov r6d, r4d
shl r4d, 8
- sub r4d, r6d
- mov r6, 8
- add r4d, 8 ; x*288+8
- sub r6d, r5d
- imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
- imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
+ movd m0, [r1]
+ sub r6d, 8
+ sub r4d, r6d ; x << 8 | (8-x)
+ mov r6d, r5d
+ shl r5d, 16
+ movd m1, [r1+1]
+ sub r6d, 8
+ sub r5d, r6d ; y << 16 | (8-y)
+ imul r4d, r5d ; xy << 24 | (8-x)y << 16 | x(8-y) << 8 | (8-x)(8-y)
+ add r1, r2
- movd m7, r6d
- movd m6, r4d
- movd m0, [r1 ]
- pshufw m7, m7, 0
- punpcklbw m0, [r1+1]
- pshufw m6, m6, 0
+ movd m6, r4d ; ABCD
+ punpcklwd m6, m6 ; ABABCDCD
+ pshufd m7, m6, 0x55 ; CDCDCDCDCDCDCDCD
+ punpcklbw m0, m1
+ pshufd m6, m6, 0x0 ; ABABABABABABABAB
.next2rows:
- movd m1, [r1+r2*1 ]
- movd m3, [r1+r2*2 ]
- punpcklbw m1, [r1+r2*1+1]
- punpcklbw m3, [r1+r2*2+1]
- lea r1, [r1+r2*2]
- movq m2, m1
- movq m4, m3
- pmaddubsw m0, m7
- pmaddubsw m1, m6
- pmaddubsw m2, m7
- pmaddubsw m3, m6
+ movd m1, [r1]
+ movd m2, [r1+1]
+ movd m3, [r1+r2]
+ movd m4, [r1+r2+1]
+ punpcklbw m1, m2
+ punpcklqdq m0, m1
+ pmaddubsw m0, m6
+ punpcklbw m3, m4
+ punpcklqdq m1, m3
+ pmaddubsw m1, m7
+%ifidn %1, avg
+ movd m2, [r0]
+ movd m4, [r0+r2]
+%endif
paddw m0, m5
- paddw m2, m5
- paddw m1, m0
- paddw m3, m2
- psrlw m1, 6
- movq m0, m4
- psrlw m3, 6
- packuswb m1, m1
- packuswb m3, m3
- CHROMAMC_AVG m1, [r0 ]
- CHROMAMC_AVG m3, [r0+r2]
- movd [r0 ], m1
- movd [r0+r2], m3
+ lea r1, [r1+r2*2]
+ paddw m0, m1
+ psrlw m0, 6
+ packuswb m0, m0
+ pshufd m1, m0, 0x1
+%ifidn %1, avg
+ pavgb m0, m2
+ pavgb m1, m4
+%endif
sub r3d, 2
+ movd [r0], m0
+ movd [r0+r2], m1
+ mova m0, m3
lea r0, [r0+r2*2]
jg .next2rows
RET
@@ -379,26 +385,23 @@ cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 0
%macro rv40_chroma_mc4_func 1 ; put vs avg
%if CONFIG_RV40_DECODER
- cglobal rv40_%1_chroma_mc4, 6, 7+UNIX64, 0
+ cglobal rv40_%1_chroma_mc4, 6, 7+UNIX64, 8
rv40_get_bias m5
jmp ..@%1_h264_chroma_mc4_after_init_ %+ cpuname
%endif
%endmacro
-%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
+%define CHROMAMC_AVG NOTHING
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1, _nornd
rv40_chroma_mc8_func put
-INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264
rv40_chroma_mc4_func put
%define CHROMAMC_AVG DIRECT_AVG
-INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1, _nornd
rv40_chroma_mc8_func avg
-INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264
rv40_chroma_mc4_func avg
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-11-05 13:11 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=176234825039.25.17628330336355125775@2cb04c0e5124 \
--to=ffmpeg-devel@ffmpeg.org \
--cc=code@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git