* [FFmpeg-devel] [PATCH] avcodec/x86/mpegvideoencdsp_init: Use xmm registers in SSSE3 functions (PR #20692)
@ 2025-10-12 7:10 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-12 7:10 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #20692 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20692
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20692.patch
>From eb12812e4c6a0a9dd781ff1f721e512e7702f3f1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 12 Oct 2025 07:18:24 +0200
Subject: [PATCH 1/3] tests/checkasm/mpegvideoencdsp: Add test for add_8x8basis
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
tests/checkasm/mpegvideoencdsp.c | 40 ++++++++++++++++++++++++++++----
1 file changed, 35 insertions(+), 5 deletions(-)
diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c
index 24791d113d..281195cd5f 100644
--- a/tests/checkasm/mpegvideoencdsp.c
+++ b/tests/checkasm/mpegvideoencdsp.c
@@ -16,20 +16,48 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "libavutil/common.h"
#include "libavutil/intreadwrite.h"
-#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
+#include "libavcodec/mathops.h"
#include "libavcodec/mpegvideoencdsp.h"
#include "checkasm.h"
-#define randomize_buffers(buf, size) \
- do { \
- for (int j = 0; j < size; j += 4) \
- AV_WN32(buf + j, rnd()); \
+#define randomize_buffers(buf, size) \
+ do { \
+ for (int j = 0; j < size; j += 4) \
+ AV_WN32((char*)buf + j, rnd()); \
} while (0)
+#define randomize_buffer_clipped(buf, min, max) \
+ do { \
+ for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j) \
+ buf[j] = rnd() % (max - min + 1) + min; \
+ } while (0)
+
+static void check_add_8x8basis(MpegvideoEncDSPContext *c)
+{
+ declare_func_emms(AV_CPU_FLAG_SSSE3, void, int16_t rem[64], const int16_t basis[64], int scale);
+ if (check_func(c->add_8x8basis, "add_8x8basis")) {
+ // FIXME: What are the actual ranges for these values?
+ int scale = sign_extend(rnd(), 12);
+ int16_t rem1[64];
+ int16_t rem2[64];
+ int16_t basis[64];
+
+ randomize_buffer_clipped(basis, -15760, 15760);
+ randomize_buffers(rem1, sizeof(rem1));
+ memcpy(rem2, rem1, sizeof(rem2));
+ call_ref(rem1, basis, scale);
+ call_new(rem2, basis, scale);
+ if (memcmp(rem1, rem2, sizeof(rem1)))
+ fail();
+ bench_new(rem1, basis, scale);
+ }
+}
+
static void check_pix_sum(MpegvideoEncDSPContext *c)
{
LOCAL_ALIGNED_16(uint8_t, src, [16 * 16]);
@@ -144,4 +172,6 @@ void checkasm_check_mpegvideoencdsp(void)
report("pix_norm1");
check_draw_edges(&c);
report("draw_edges");
+ check_add_8x8basis(&c);
+ report("add_8x8basis");
}
--
2.49.1
>From 77e38557e5d27c2cc698d1c07b0e6311a89cf113 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 12 Oct 2025 07:29:04 +0200
Subject: [PATCH 2/3] avcodec/x86/mpegvideoencdsp_init: Don't use slow path
unnecessarily
The only requirement of this code (and essentially of the pmulhrsw
instruction) is that the scale, after being shifted left, fits into an int16_t.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/mpegvideoencdsp_init.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index 78c2ef87b8..dc8fcd8833 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -90,7 +90,7 @@ static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int sca
{
x86_reg i=0;
- if (FFABS(scale) < MAX_ABS) {
+ if (FFABS(scale) < 1024) {
scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
__asm__ volatile(
"movd %3, %%mm5 \n\t"
--
2.49.1
>From 803493e80ce2a887f1e1b67e51f15f8b548dbe0b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 12 Oct 2025 08:04:11 +0200
Subject: [PATCH 3/3] avcodec/x86/mpegvideoencdsp_init: Use xmm registers in
SSSE3 functions
Improves performance and no longer breaks the ABI (the old MMX code
forgot to call emms).
Old benchmarks:
add_8x8basis_c: 43.6 ( 1.00x)
add_8x8basis_ssse3: 12.3 ( 3.55x)
New benchmarks:
add_8x8basis_c: 43.0 ( 1.00x)
add_8x8basis_ssse3: 6.3 ( 6.79x)
Notice that the output of try_8x8basis_ssse3 changes a bit:
Before this commit, it computed certain values and added the values
for i,i+1,i+4 and i+5 before right shifting them; now it adds
the values for i,i+1,i+8 and i+9. The second pair in each of these lists
could be avoided (by shifting xmm0 and xmm1 before adding both together
instead of only shifting xmm0 after adding them), but the first pair,
i,i+1, is inherent in using pmaddwd. This is the reason that this
function is not bitexact.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/mpegvideo_enc.c | 6 +-
libavcodec/x86/mpegvideoencdsp_init.c | 99 +++++++++++++--------------
tests/checkasm/mpegvideoencdsp.c | 8 +--
3 files changed, 55 insertions(+), 58 deletions(-)
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index dbf4d25136..9f5da254bf 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -2296,7 +2296,7 @@ static av_always_inline void encode_mb_internal(MPVEncContext *const s,
* and neither of these encoders currently supports 444. */
#define INTERLACED_DCT(s) ((chroma_format == CHROMA_420 || chroma_format == CHROMA_422) && \
(s)->c.avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT)
- int16_t weight[12][64];
+ DECLARE_ALIGNED(16, int16_t, weight)[12][64];
int16_t orig[12][64];
const int mb_x = s->c.mb_x;
const int mb_y = s->c.mb_y;
@@ -4293,7 +4293,7 @@ static int dct_quantize_trellis_c(MPVEncContext *const s,
return last_non_zero;
}
-static int16_t basis[64][64];
+static DECLARE_ALIGNED(16, int16_t, basis)[64][64];
static void build_basis(uint8_t *perm){
int i, j, x, y;
@@ -4317,7 +4317,7 @@ static void build_basis(uint8_t *perm){
static int dct_quantize_refine(MPVEncContext *const s, //FIXME breaks denoise?
int16_t *block, int16_t *weight, int16_t *orig,
int n, int qscale){
- int16_t rem[64];
+ DECLARE_ALIGNED(16, int16_t, rem)[64];
LOCAL_ALIGNED_16(int16_t, d1, [64]);
const uint8_t *scantable;
const uint8_t *perm_scantable;
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index dc8fcd8833..3cd16fefbf 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -35,13 +35,6 @@ int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
#if HAVE_SSSE3_INLINE
#define SCALE_OFFSET -1
-/*
- * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
- */
-#define PMULHRW(x, y, s, o) \
- "pmulhrsw " #s ", " #x " \n\t" \
- "pmulhrsw " #s ", " #y " \n\t"
-
#define MAX_ABS 512
static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], const int16_t basis[64], int scale)
@@ -52,36 +45,39 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c
scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
__asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "movd %4, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "psraw $6, %%mm0 \n\t"
- "psraw $6, %%mm1 \n\t"
- "pmullw (%3, %0), %%mm0 \n\t"
- "pmullw 8(%3, %0), %%mm1 \n\t"
- "pmaddwd %%mm0, %%mm0 \n\t"
- "pmaddwd %%mm1, %%mm1 \n\t"
- "paddd %%mm1, %%mm0 \n\t"
- "psrld $4, %%mm0 \n\t"
- "paddd %%mm0, %%mm7 \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" //FIXME optimize & bench
- " jb 1b \n\t"
- "pshufw $0x0E, %%mm7, %%mm6 \n\t"
- "paddd %%mm6, %%mm7 \n\t" // faster than phaddd on core2
- "psrld $2, %%mm7 \n\t"
- "movd %%mm7, %0 \n\t"
-
+ "pxor %%xmm2, %%xmm2 \n\t"
+ "movd %4, %%xmm3 \n\t"
+ "punpcklwd %%xmm3, %%xmm3 \n\t"
+ "pshufd $0, %%xmm3, %%xmm3 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movdqa (%1, %0), %%xmm0 \n\t"
+ "movdqa 16(%1, %0), %%xmm1 \n\t"
+ "pmulhrsw %%xmm3, %%xmm0 \n\t"
+ "pmulhrsw %%xmm3, %%xmm1 \n\t"
+ "paddw (%2, %0), %%xmm0 \n\t"
+ "paddw 16(%2, %0), %%xmm1 \n\t"
+ "psraw $6, %%xmm0 \n\t"
+ "psraw $6, %%xmm1 \n\t"
+ "pmullw (%3, %0), %%xmm0 \n\t"
+ "pmullw 16(%3, %0), %%xmm1 \n\t"
+ "pmaddwd %%xmm0, %%xmm0 \n\t"
+ "pmaddwd %%xmm1, %%xmm1 \n\t"
+ "paddd %%xmm1, %%xmm0 \n\t"
+ "psrld $4, %%xmm0 \n\t"
+ "paddd %%xmm0, %%xmm2 \n\t"
+ "add $32, %0 \n\t"
+ "cmp $128, %0 \n\t" //FIXME optimize & bench
+ " jb 1b \n\t"
+ "pshufd $0x0E, %%xmm2, %%xmm0 \n\t"
+ "paddd %%xmm0, %%xmm2 \n\t"
+ "pshufd $0x01, %%xmm2, %%xmm0 \n\t"
+ "paddd %%xmm0, %%xmm2 \n\t"
+ "psrld $2, %%xmm2 \n\t"
+ "movd %%xmm2, %0 \n\t"
: "+r" (i)
: "r"(basis), "r"(rem), "r"(weight), "g"(scale)
+ XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")
);
return i;
}
@@ -93,24 +89,25 @@ static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int sca
if (FFABS(scale) < 1024) {
scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
__asm__ volatile(
- "movd %3, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "movq %%mm0, (%2, %0) \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" // FIXME optimize & bench
- " jb 1b \n\t"
-
+ "movd %3, %%xmm2 \n\t"
+ "punpcklwd %%xmm2, %%xmm2 \n\t"
+ "pshufd $0, %%xmm2, %%xmm2 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movdqa (%1, %0), %%xmm0 \n\t"
+ "movdqa 16(%1, %0), %%xmm1 \n\t"
+ "pmulhrsw %%xmm2, %%xmm0 \n\t"
+ "pmulhrsw %%xmm2, %%xmm1 \n\t"
+ "paddw (%2, %0), %%xmm0 \n\t"
+ "paddw 16(%2, %0), %%xmm1 \n\t"
+ "movdqa %%xmm0, (%2, %0) \n\t"
+ "movdqa %%xmm1, 16(%2, %0) \n\t"
+ "add $32, %0 \n\t"
+ "cmp $128, %0 \n\t" // FIXME optimize & bench
+ " jb 1b \n\t"
: "+r" (i)
: "r"(basis), "r"(rem), "g"(scale)
+ XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
);
} else {
for (i=0; i<8*8; i++) {
diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c
index 281195cd5f..a4a4fa6f5c 100644
--- a/tests/checkasm/mpegvideoencdsp.c
+++ b/tests/checkasm/mpegvideoencdsp.c
@@ -39,13 +39,13 @@
static void check_add_8x8basis(MpegvideoEncDSPContext *c)
{
- declare_func_emms(AV_CPU_FLAG_SSSE3, void, int16_t rem[64], const int16_t basis[64], int scale);
+ declare_func(void, int16_t rem[64], const int16_t basis[64], int scale);
if (check_func(c->add_8x8basis, "add_8x8basis")) {
// FIXME: What are the actual ranges for these values?
int scale = sign_extend(rnd(), 12);
- int16_t rem1[64];
- int16_t rem2[64];
- int16_t basis[64];
+ DECLARE_ALIGNED(16, int16_t, rem1)[64];
+ DECLARE_ALIGNED(16, int16_t, rem2)[64];
+ DECLARE_ALIGNED(16, int16_t, basis)[64];
randomize_buffer_clipped(basis, -15760, 15760);
randomize_buffers(rem1, sizeof(rem1));
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-10-12 7:11 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-12 7:10 [FFmpeg-devel] [PATCH] avcodec/x86/mpegvideoencdsp_init: Use xmm registers in SSSE3 functions (PR #20692) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git