From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: mkver <code@ffmpeg.org> Subject: [FFmpeg-devel] [PATCH] hpeldsp (PR #20668) Date: Wed, 08 Oct 2025 15:20:35 -0000 Message-ID: <175993683631.65.2086698086369123246@bf249f23a2c8> (raw) PR #20668 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20668 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20668.patch >From 09c0c45423ddbfe6925a7b1aed004a2f35d92e9e Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Fri, 3 Oct 2025 04:14:59 +0200 Subject: [PATCH 1/3] avcodec/x86/hpeldsp: Add SSE2 of {avg,put} no_rnd xy2 with blocksize 16 Also remove the now superseded MMX versions (the new functions have the exact same codesize as the removed ones). Old benchmarks: avg_no_rnd_pixels_tab[0][3]_c: 233.7 ( 1.00x) avg_no_rnd_pixels_tab[0][3]_mmx: 121.5 ( 1.92x) put_no_rnd_pixels_tab[0][3]_c: 171.4 ( 1.00x) put_no_rnd_pixels_tab[0][3]_mmx: 82.6 ( 2.08x) New benchmarks: avg_no_rnd_pixels_tab[0][3]_c: 233.3 ( 1.00x) avg_no_rnd_pixels_tab[0][3]_sse2: 45.0 ( 5.18x) put_no_rnd_pixels_tab[0][3]_c: 172.1 ( 1.00x) put_no_rnd_pixels_tab[0][3]_sse2: 40.9 ( 4.21x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/hpeldsp.asm | 13 +++-- libavcodec/x86/hpeldsp_init.c | 89 +++-------------------------------- 2 files changed, 14 insertions(+), 88 deletions(-) diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index e9f988f7b5..4c19da3e2a 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -29,6 +29,7 @@ SECTION_RODATA cextern pb_1 +cextern pw_1 cextern pw_2 pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7 @@ -407,10 +408,10 @@ AVG_PIXELS8_Y2 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro SET_PIXELS_XY2 1 -cglobal %1_pixels16_xy2, 4,5,8 +%macro SET_PIXELS_XY2 
2-3 +cglobal %1%3_pixels16_xy2, 4,5,8 pxor m7, m7 - mova m6, [pw_2] + mova m6, [%2] movu m0, [r1] movu m4, [r1+1] mova m1, m0 @@ -481,8 +482,10 @@ cglobal %1_pixels16_xy2, 4,5,8 %endmacro INIT_XMM sse2 -SET_PIXELS_XY2 put -SET_PIXELS_XY2 avg +SET_PIXELS_XY2 put, pw_2 +SET_PIXELS_XY2 avg, pw_2 +SET_PIXELS_XY2 put, pw_1, _no_rnd +SET_PIXELS_XY2 avg, pw_1, _no_rnd %macro SSSE3_PIXELS_XY2 1-2 %if %0 == 2 ; sse2 diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index cb47cb7752..44e44f0975 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -30,7 +30,6 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/avcodec.h" #include "libavcodec/hpeldsp.h" -#include "libavcodec/pixels.h" #include "fpel.h" #include "hpeldsp.h" #include "inline_asm.h" @@ -65,6 +64,10 @@ void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, @@ -143,94 +146,12 @@ static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, :FF_REG_a, "memory"); } -// this routine is 'slightly' suboptimal but mostly unused -static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - MOVQ_WONE(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, 
%%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) - "movq %%mm5, (%2, %%"FF_REG_a") \n\t" - "add %3, %%"FF_REG_a" \n\t" - - "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) - "movq %%mm1, (%2, %%"FF_REG_a") \n\t" - "add %3, %%"FF_REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} - -#if HAVE_MMX -CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8) -CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8) -#endif #endif /* 
HAVE_INLINE_ASM */ static void hpeldsp_init_mmx(HpelDSPContext *c, int flags) { #if HAVE_MMX_INLINE - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx; c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx; - c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx; #endif #if HAVE_MMX_EXTERNAL c->put_no_rnd_pixels_tab[1][0] = @@ -269,6 +190,7 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags) c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_sse2; c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2; + c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_sse2; c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2; @@ -278,6 +200,7 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags) c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_sse2; c->avg_no_rnd_pixels_tab[1] = ff_avg_no_rnd_pixels16_x2_sse2; c->avg_no_rnd_pixels_tab[2] = ff_avg_no_rnd_pixels16_y2_sse2; + c->avg_no_rnd_pixels_tab[3] = ff_avg_no_rnd_pixels16_xy2_sse2; #endif /* HAVE_SSE2_EXTERNAL */ } -- 2.49.1 >From 1f91867029276984e464eb2b46cf8a053b2155a7 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Fri, 3 Oct 2025 05:20:32 +0200 Subject: [PATCH 2/3] avcodec/x86/hpeldsp: Add ff_put_no_rnd_pixels8_xy2_ssse3() Given that one has to deal with 16 byte intermediates it is unsurprising that SSE2 wins against MMX; the MMX version has therefore been removed (as well as the now unused inline_asm.h). The new function is even 32B smaller than the old MMX one. 
Old benchmarks: put_no_rnd_pixels_tab[1][3]_c: 84.1 ( 1.00x) put_no_rnd_pixels_tab[1][3]_mmx: 41.1 ( 2.05x) New benchmarks: put_no_rnd_pixels_tab[1][3]_c: 84.0 ( 1.00x) put_no_rnd_pixels_tab[1][3]_ssse3: 22.1 ( 3.80x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/hpeldsp.asm | 54 ++++++++++++++++++ libavcodec/x86/hpeldsp_init.c | 83 ++-------------------------- libavcodec/x86/inline_asm.h | 100 ---------------------------------- 3 files changed, 58 insertions(+), 179 deletions(-) delete mode 100644 libavcodec/x86/inline_asm.h diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index 4c19da3e2a..fc51d72d17 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -407,6 +407,60 @@ INIT_XMM sse2 AVG_PIXELS8_Y2 +; void ff_put_no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +%macro SET_PIXELS8_XY2 2-3 +cglobal %1%3_pixels8_xy2, 4,5,5 + mova m4, [pb_1] + mova m3, [%2] + movh m0, [r1] + movh m2, [r1+1] + punpcklbw m2, m0 + pmaddubsw m2, m4 + xor r4, r4 + add r1, r2 +.loop: + movh m0, [r1+r4] + movh m1, [r1+r4+1] + punpcklbw m0, m1 + pmaddubsw m0, m4 + paddusw m2, m3 + paddusw m2, m0 + psrlw m2, 2 +%ifidn %1, avg + movh m1, [r0+r4] + packuswb m2, m2 + pavgb m2, m1 +%else + packuswb m2, m2 +%endif + movh [r0+r4], m2 + add r4, r2 + + movh m1, [r1+r4] + movh m2, [r1+r4+1] + punpcklbw m2, m1 + pmaddubsw m2, m4 + paddusw m0, m3 + paddusw m0, m2 + psrlw m0, 2 +%ifidn %1, avg + movh m1, [r0+r4] + packuswb m0, m0 + pavgb m0, m1 +%else + packuswb m0, m0 +%endif + movh [r0+r4], m0 + add r4, r2 + sub r3d, 2 + jnz .loop + RET +%endmacro + +INIT_XMM ssse3 +SET_PIXELS8_XY2 put, pw_1, _no_rnd + + ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) %macro SET_PIXELS_XY2 2-3 cglobal %1%3_pixels16_xy2, 4,5,8 diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index 44e44f0975..f4b123ce03 100644 --- 
a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -32,7 +32,6 @@ #include "libavcodec/hpeldsp.h" #include "fpel.h" #include "hpeldsp.h" -#include "inline_asm.h" void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); @@ -64,6 +63,8 @@ void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_put_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels, @@ -73,86 +74,8 @@ void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -#if HAVE_INLINE_ASM - -/***********************************/ -/* MMX no rounding */ - -// put_pixels -static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - MOVQ_WONE(mm6); // =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw 
%%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "movq %%mm4, (%2, %%"FF_REG_a") \n\t" - "add %3, %%"FF_REG_a" \n\t" - - "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%2, %%"FF_REG_a") \n\t" - "add %3, %%"FF_REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} - -#endif /* HAVE_INLINE_ASM */ - static void hpeldsp_init_mmx(HpelDSPContext *c, int flags) { -#if HAVE_MMX_INLINE - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx; -#endif #if HAVE_MMX_EXTERNAL c->put_no_rnd_pixels_tab[1][0] = c->put_pixels_tab[1][0] = ff_put_pixels8_mmx; @@ -211,6 +134,8 @@ static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags) c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3; c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3; c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3; + + c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_ssse3; #endif } diff --git a/libavcodec/x86/inline_asm.h b/libavcodec/x86/inline_asm.h deleted file mode 100644 index 0198746719..0000000000 --- a/libavcodec/x86/inline_asm.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * inline assembly helper macros - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_INLINE_ASM_H -#define AVCODEC_X86_INLINE_ASM_H - -#include "constants.h" - -#define MOVQ_WONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd ::) - -#define JUMPALIGN() __asm__ volatile (".p2align 3"::) -#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::) - -#define MOVQ_BFE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "paddb %%"#regd", %%"#regd" \n\t" ::) - -#ifndef PIC -#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2)) -#else -// for shared library it's better to use this way for accessing constants -// pcmpeqd -> -1 -#define MOVQ_WTWO(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "psrlw $15, %%"#regd" \n\t" \ - "psllw $1, %%"#regd" \n\t"::) - -#endif - -// using regr as temporary and for the output result -// first argument is unmodified and second is trashed -// regfe is supposed to contain 0xfefefefefefefefe -#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ - "movq "#rega", "#regr" \n\t" \ - "pand "#regb", "#regr" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pand "#regfe", "#regb" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "paddb "#regb", "#regr" \n\t" - 
-#define PAVGB_MMX(rega, regb, regr, regfe) \ - "movq "#rega", "#regr" \n\t" \ - "por "#regb", "#regr" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pand "#regfe", "#regb" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psubb "#regb", "#regr" \n\t" - -// mm6 is supposed to contain 0xfefefefefefefefe -#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ - "movq "#rega", "#regr" \n\t" \ - "movq "#regc", "#regp" \n\t" \ - "pand "#regb", "#regr" \n\t" \ - "pand "#regd", "#regp" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pxor "#regc", "#regd" \n\t" \ - "pand %%mm6, "#regb" \n\t" \ - "pand %%mm6, "#regd" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psrlq $1, "#regd" \n\t" \ - "paddb "#regb", "#regr" \n\t" \ - "paddb "#regd", "#regp" \n\t" - -#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ - "movq "#rega", "#regr" \n\t" \ - "movq "#regc", "#regp" \n\t" \ - "por "#regb", "#regr" \n\t" \ - "por "#regd", "#regp" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pxor "#regc", "#regd" \n\t" \ - "pand %%mm6, "#regb" \n\t" \ - "pand %%mm6, "#regd" \n\t" \ - "psrlq $1, "#regd" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psubb "#regb", "#regr" \n\t" \ - "psubb "#regd", "#regp" \n\t" - -#endif /* AVCODEC_X86_INLINE_ASM_H */ -- 2.49.1 >From 790d1fa6b3638d84306b5803a5efb5019881a429 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Fri, 3 Oct 2025 09:29:24 +0200 Subject: [PATCH 3/3] avcodec/x86/hpeldsp: Improve ff_{avg,put}_pixels8_xy2_ssse3() These SSSE3 functions use MMX registers (of course without emms at the end) and process eight bytes of input by unpacking them into two MMX registers. This is very suboptimal given that one can just use XMM registers to process eight words. This commit switches them to using XMM registers. 
Old benchmarks: avg_pixels_tab[1][3]_c: 114.5 ( 1.00x) avg_pixels_tab[1][3]_ssse3: 43.6 ( 2.62x) put_pixels_tab[1][3]_c: 83.6 ( 1.00x) put_pixels_tab[1][3]_ssse3: 34.0 ( 2.46x) New benchmarks: avg_pixels_tab[1][3]_c: 115.3 ( 1.00x) avg_pixels_tab[1][3]_ssse3: 24.6 ( 4.69x) put_pixels_tab[1][3]_c: 83.8 ( 1.00x) put_pixels_tab[1][3]_ssse3: 19.7 ( 4.24x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/hpeldsp.asm | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index fc51d72d17..2587e3c315 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -32,7 +32,6 @@ cextern pb_1 cextern pw_1 cextern pw_2 pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 -pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7 cextern pw_8192 @@ -423,9 +422,14 @@ cglobal %1%3_pixels8_xy2, 4,5,5 movh m1, [r1+r4+1] punpcklbw m0, m1 pmaddubsw m0, m4 +%ifidn %3, _no_rnd paddusw m2, m3 paddusw m2, m0 psrlw m2, 2 +%else + paddusw m2, m0 + pmulhrsw m2, [pw_8192] +%endif %ifidn %1, avg movh m1, [r0+r4] packuswb m2, m2 @@ -440,9 +444,14 @@ cglobal %1%3_pixels8_xy2, 4,5,5 movh m2, [r1+r4+1] punpcklbw m2, m1 pmaddubsw m2, m4 +%ifidn %3, _no_rnd paddusw m0, m3 paddusw m0, m2 psrlw m0, 2 +%else + paddusw m0, m2 + pmulhrsw m0, [pw_8192] +%endif %ifidn %1, avg movh m1, [r0+r4] packuswb m0, m0 @@ -459,6 +468,8 @@ cglobal %1%3_pixels8_xy2, 4,5,5 INIT_XMM ssse3 SET_PIXELS8_XY2 put, pw_1, _no_rnd +SET_PIXELS8_XY2 avg, pw_8192 +SET_PIXELS8_XY2 put, pw_8192 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) @@ -542,13 +553,8 @@ SET_PIXELS_XY2 put, pw_1, _no_rnd SET_PIXELS_XY2 avg, pw_1, _no_rnd %macro SSSE3_PIXELS_XY2 1-2 -%if %0 == 2 ; sse2 cglobal %1_pixels16_xy2, 4,5,%2 mova m4, [pb_interleave16] -%else -cglobal %1_pixels8_xy2, 4,5 - mova m4, [pb_interleave8] -%endif mova m5, [pb_1] movu m0, [r1] movu m1, 
[r1+1] @@ -601,9 +607,6 @@ cglobal %1_pixels8_xy2, 4,5 RET %endmacro -INIT_MMX ssse3 -SSSE3_PIXELS_XY2 put -SSSE3_PIXELS_XY2 avg INIT_XMM ssse3 SSSE3_PIXELS_XY2 put, 6 SSSE3_PIXELS_XY2 avg, 7 -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads: [~2025-10-08 15:21 UTC | newest] Thread overview: [no followups] expand [flat | nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=175993683631.65.2086698086369123246@bf249f23a2c8 \ --to=ffmpeg-devel@ffmpeg.org \ --cc=code@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git