From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: mkver <code@ffmpeg.org> Subject: [FFmpeg-devel] [PATCH] avcodec/x86/mpegvideoencdsp_init: Use xmm registers in SSSE3 functions (PR #20692) Date: Sun, 12 Oct 2025 07:10:44 -0000 Message-ID: <176025304521.52.4016490393094737049@bf249f23a2c8> (raw) PR #20692 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20692 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20692.patch >From eb12812e4c6a0a9dd781ff1f721e512e7702f3f1 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 12 Oct 2025 07:18:24 +0200 Subject: [PATCH 1/3] tests/checkasm/mpegvideoencdsp: Add test for add_8x8basis Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- tests/checkasm/mpegvideoencdsp.c | 40 ++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c index 24791d113d..281195cd5f 100644 --- a/tests/checkasm/mpegvideoencdsp.c +++ b/tests/checkasm/mpegvideoencdsp.c @@ -16,20 +16,48 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/common.h" #include "libavutil/intreadwrite.h" -#include "libavutil/mem.h" #include "libavutil/mem_internal.h" +#include "libavcodec/mathops.h" #include "libavcodec/mpegvideoencdsp.h" #include "checkasm.h" -#define randomize_buffers(buf, size) \ - do { \ - for (int j = 0; j < size; j += 4) \ - AV_WN32(buf + j, rnd()); \ +#define randomize_buffers(buf, size) \ + do { \ + for (int j = 0; j < size; j += 4) \ + AV_WN32((char*)buf + j, rnd()); \ } while (0) +#define randomize_buffer_clipped(buf, min, max) \ + do { \ + for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j) \ + buf[j] = rnd() % (max - min + 1) + min; \ + } while (0) + +static void check_add_8x8basis(MpegvideoEncDSPContext *c) +{ + declare_func_emms(AV_CPU_FLAG_SSSE3, void, int16_t rem[64], const 
int16_t basis[64], int scale); + if (check_func(c->add_8x8basis, "add_8x8basis")) { + // FIXME: What are the actual ranges for these values? + int scale = sign_extend(rnd(), 12); + int16_t rem1[64]; + int16_t rem2[64]; + int16_t basis[64]; + + randomize_buffer_clipped(basis, -15760, 15760); + randomize_buffers(rem1, sizeof(rem1)); + memcpy(rem2, rem1, sizeof(rem2)); + call_ref(rem1, basis, scale); + call_new(rem2, basis, scale); + if (memcmp(rem1, rem2, sizeof(rem1))) + fail(); + bench_new(rem1, basis, scale); + } +} + static void check_pix_sum(MpegvideoEncDSPContext *c) { LOCAL_ALIGNED_16(uint8_t, src, [16 * 16]); @@ -144,4 +172,6 @@ void checkasm_check_mpegvideoencdsp(void) report("pix_norm1"); check_draw_edges(&c); report("draw_edges"); + check_add_8x8basis(&c); + report("add_8x8basis"); } -- 2.49.1 >From 77e38557e5d27c2cc698d1c07b0e6311a89cf113 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 12 Oct 2025 07:29:04 +0200 Subject: [PATCH 2/3] avcodec/x86/mpegvideoencdsp_init: Don't use slow path unnecessarily The only requirement of this code (and essentially the pmulhrsw instruction) is that the scaled scale fits into an int16_t. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/mpegvideoencdsp_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c index 78c2ef87b8..dc8fcd8833 100644 --- a/libavcodec/x86/mpegvideoencdsp_init.c +++ b/libavcodec/x86/mpegvideoencdsp_init.c @@ -90,7 +90,7 @@ static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int sca { x86_reg i=0; - if (FFABS(scale) < MAX_ABS) { + if (FFABS(scale) < 1024) { scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; __asm__ volatile( "movd %3, %%mm5 \n\t" -- 2.49.1 >From 803493e80ce2a887f1e1b67e51f15f8b548dbe0b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 12 Oct 2025 08:04:11 +0200 Subject: [PATCH 3/3] avcodec/x86/mpegvideoencdsp_init: Use xmm registers in SSSE3 functions Improves performance and fixes an ABI violation (the old MMX code forgot to call emms). Old benchmarks: add_8x8basis_c: 43.6 ( 1.00x) add_8x8basis_ssse3: 12.3 ( 3.55x) New benchmarks: add_8x8basis_c: 43.0 ( 1.00x) add_8x8basis_ssse3: 6.3 ( 6.79x) Notice that the output of try_8x8basis_ssse3 changes a bit: Before this commit, it computed certain values and added the values for i,i+1,i+4 and i+5 before right shifting them; now it adds the values for i,i+1,i+8,i+9. The second pair in these lists could be avoided (by shifting xmm0 and xmm1 before adding both together instead of only shifting xmm0 after adding them), but the former i,i+1 is inherent in using pmaddwd. This is the reason that this function is not bitexact. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/mpegvideo_enc.c | 6 +- libavcodec/x86/mpegvideoencdsp_init.c | 99 +++++++++++++-------------- tests/checkasm/mpegvideoencdsp.c | 8 +-- 3 files changed, 55 insertions(+), 58 deletions(-) diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c index dbf4d25136..9f5da254bf 100644 --- a/libavcodec/mpegvideo_enc.c +++ b/libavcodec/mpegvideo_enc.c @@ -2296,7 +2296,7 @@ static av_always_inline void encode_mb_internal(MPVEncContext *const s, * and neither of these encoders currently supports 444. */ #define INTERLACED_DCT(s) ((chroma_format == CHROMA_420 || chroma_format == CHROMA_422) && \ (s)->c.avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) - int16_t weight[12][64]; + DECLARE_ALIGNED(16, int16_t, weight)[12][64]; int16_t orig[12][64]; const int mb_x = s->c.mb_x; const int mb_y = s->c.mb_y; @@ -4293,7 +4293,7 @@ static int dct_quantize_trellis_c(MPVEncContext *const s, return last_non_zero; } -static int16_t basis[64][64]; +static DECLARE_ALIGNED(16, int16_t, basis)[64][64]; static void build_basis(uint8_t *perm){ int i, j, x, y; @@ -4317,7 +4317,7 @@ static void build_basis(uint8_t *perm){ static int dct_quantize_refine(MPVEncContext *const s, //FIXME breaks denoise? 
int16_t *block, int16_t *weight, int16_t *orig, int n, int qscale){ - int16_t rem[64]; + DECLARE_ALIGNED(16, int16_t, rem)[64]; LOCAL_ALIGNED_16(int16_t, d1, [64]); const uint8_t *scantable; const uint8_t *perm_scantable; diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c index dc8fcd8833..3cd16fefbf 100644 --- a/libavcodec/x86/mpegvideoencdsp_init.c +++ b/libavcodec/x86/mpegvideoencdsp_init.c @@ -35,13 +35,6 @@ int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size); #if HAVE_SSSE3_INLINE #define SCALE_OFFSET -1 -/* - * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30] - */ -#define PMULHRW(x, y, s, o) \ - "pmulhrsw " #s ", " #x " \n\t" \ - "pmulhrsw " #s ", " #y " \n\t" - #define MAX_ABS 512 static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], const int16_t basis[64], int scale) @@ -52,36 +45,39 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "movd %4, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" - PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) - "paddw (%2, %0), %%mm0 \n\t" - "paddw 8(%2, %0), %%mm1 \n\t" - "psraw $6, %%mm0 \n\t" - "psraw $6, %%mm1 \n\t" - "pmullw (%3, %0), %%mm0 \n\t" - "pmullw 8(%3, %0), %%mm1 \n\t" - "pmaddwd %%mm0, %%mm0 \n\t" - "pmaddwd %%mm1, %%mm1 \n\t" - "paddd %%mm1, %%mm0 \n\t" - "psrld $4, %%mm0 \n\t" - "paddd %%mm0, %%mm7 \n\t" - "add $16, %0 \n\t" - "cmp $128, %0 \n\t" //FIXME optimize & bench - " jb 1b \n\t" - "pshufw $0x0E, %%mm7, %%mm6 \n\t" - "paddd %%mm6, %%mm7 \n\t" // faster than phaddd on core2 - "psrld $2, %%mm7 \n\t" - "movd %%mm7, %0 \n\t" - + "pxor %%xmm2, %%xmm2 \n\t" + "movd %4, %%xmm3 \n\t" + "punpcklwd %%xmm3, %%xmm3 \n\t" + "pshufd $0, %%xmm3, %%xmm3 \n\t" + ".p2align 4 \n\t" 
+ "1: \n\t" + "movdqa (%1, %0), %%xmm0 \n\t" + "movdqa 16(%1, %0), %%xmm1 \n\t" + "pmulhrsw %%xmm3, %%xmm0 \n\t" + "pmulhrsw %%xmm3, %%xmm1 \n\t" + "paddw (%2, %0), %%xmm0 \n\t" + "paddw 16(%2, %0), %%xmm1 \n\t" + "psraw $6, %%xmm0 \n\t" + "psraw $6, %%xmm1 \n\t" + "pmullw (%3, %0), %%xmm0 \n\t" + "pmullw 16(%3, %0), %%xmm1 \n\t" + "pmaddwd %%xmm0, %%xmm0 \n\t" + "pmaddwd %%xmm1, %%xmm1 \n\t" + "paddd %%xmm1, %%xmm0 \n\t" + "psrld $4, %%xmm0 \n\t" + "paddd %%xmm0, %%xmm2 \n\t" + "add $32, %0 \n\t" + "cmp $128, %0 \n\t" //FIXME optimize & bench + " jb 1b \n\t" + "pshufd $0x0E, %%xmm2, %%xmm0 \n\t" + "paddd %%xmm0, %%xmm2 \n\t" + "pshufd $0x01, %%xmm2, %%xmm0 \n\t" + "paddd %%xmm0, %%xmm2 \n\t" + "psrld $2, %%xmm2 \n\t" + "movd %%xmm2, %0 \n\t" : "+r" (i) : "r"(basis), "r"(rem), "r"(weight), "g"(scale) + XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3") ); return i; } @@ -93,24 +89,25 @@ static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int sca if (FFABS(scale) < 1024) { scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; __asm__ volatile( - "movd %3, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" - PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) - "paddw (%2, %0), %%mm0 \n\t" - "paddw 8(%2, %0), %%mm1 \n\t" - "movq %%mm0, (%2, %0) \n\t" - "movq %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "cmp $128, %0 \n\t" // FIXME optimize & bench - " jb 1b \n\t" - + "movd %3, %%xmm2 \n\t" + "punpcklwd %%xmm2, %%xmm2 \n\t" + "pshufd $0, %%xmm2, %%xmm2 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movdqa (%1, %0), %%xmm0 \n\t" + "movdqa 16(%1, %0), %%xmm1 \n\t" + "pmulhrsw %%xmm2, %%xmm0 \n\t" + "pmulhrsw %%xmm2, %%xmm1 \n\t" + "paddw (%2, %0), %%xmm0 \n\t" + "paddw 16(%2, %0), %%xmm1 \n\t" + "movdqa %%xmm0, (%2, %0) \n\t" + "movdqa %%xmm1, 16(%2, %0) \n\t" + "add $32, %0 \n\t" + "cmp $128, %0 \n\t" // FIXME optimize & bench + " jb 1b \n\t" : 
"+r" (i) : "r"(basis), "r"(rem), "g"(scale) + XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2") ); } else { for (i=0; i<8*8; i++) { diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c index 281195cd5f..a4a4fa6f5c 100644 --- a/tests/checkasm/mpegvideoencdsp.c +++ b/tests/checkasm/mpegvideoencdsp.c @@ -39,13 +39,13 @@ static void check_add_8x8basis(MpegvideoEncDSPContext *c) { - declare_func_emms(AV_CPU_FLAG_SSSE3, void, int16_t rem[64], const int16_t basis[64], int scale); + declare_func(void, int16_t rem[64], const int16_t basis[64], int scale); if (check_func(c->add_8x8basis, "add_8x8basis")) { // FIXME: What are the actual ranges for these values? int scale = sign_extend(rnd(), 12); - int16_t rem1[64]; - int16_t rem2[64]; - int16_t basis[64]; + DECLARE_ALIGNED(16, int16_t, rem1)[64]; + DECLARE_ALIGNED(16, int16_t, rem2)[64]; + DECLARE_ALIGNED(16, int16_t, basis)[64]; randomize_buffer_clipped(basis, -15760, 15760); randomize_buffers(rem1, sizeof(rem1)); -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-10-12 7:11 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=176025304521.52.4016490393094737049@bf249f23a2c8 \ --to=ffmpeg-devel@ffmpeg.org \ --cc=code@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git