From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: mkver <code@ffmpeg.org> Subject: [FFmpeg-devel] [PATCH] qpeldsp and h264_qpeldsp (PR #20629) Date: Tue, 30 Sep 2025 02:09:11 -0000 Message-ID: <175919815179.25.7383648006752454224@bf249f23a2c8> (raw) PR #20629 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20629 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20629.patch >From 824bdbb577bd41aacc0cfaa4fbf8df56cba53446 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sat, 27 Sep 2025 19:15:49 +0200 Subject: [PATCH 01/26] tests/checkasm: Add qpeldsp checkasm Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/qpeldsp.c | 108 ++++++++++++++++++++++++++++++++++++++ tests/fate/checkasm.mak | 1 + 5 files changed, 114 insertions(+) create mode 100644 tests/checkasm/qpeldsp.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index c41d719e82..1589a15e2f 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -20,6 +20,7 @@ AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o AVCODECOBJS-$(CONFIG_LPC) += lpc.o AVCODECOBJS-$(CONFIG_ME_CMP) += motion.o AVCODECOBJS-$(CONFIG_MPEGVIDEOENCDSP) += mpegvideoencdsp.o +AVCODECOBJS-$(CONFIG_QPELDSP) += qpeldsp.o AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index b23e4ce889..e59d366f2b 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -220,6 +220,9 @@ static const struct { #if CONFIG_PIXBLOCKDSP { "pixblockdsp", checkasm_check_pixblockdsp }, #endif + #if CONFIG_QPELDSP + { "qpeldsp", checkasm_check_qpeldsp }, + #endif #if CONFIG_RV34DSP { "rv34dsp", checkasm_check_rv34dsp }, #endif diff --git 
a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 2c259aae01..eda806e870 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -125,6 +125,7 @@ void checkasm_check_mpegvideoencdsp(void); void checkasm_check_nlmeans(void); void checkasm_check_opusdsp(void); void checkasm_check_pixblockdsp(void); +void checkasm_check_qpeldsp(void); void checkasm_check_sbrdsp(void); void checkasm_check_rv34dsp(void); void checkasm_check_rv40dsp(void); diff --git a/tests/checkasm/qpeldsp.c b/tests/checkasm/qpeldsp.c new file mode 100644 index 0000000000..fd3c50cd55 --- /dev/null +++ b/tests/checkasm/qpeldsp.c @@ -0,0 +1,108 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <assert.h> +#include <stddef.h> + +#include "checkasm.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/macros.h" +#include "libavutil/mem_internal.h" +#include "libavcodec/qpeldsp.h" + +#define MAX_BLOCK_SIZE 16 +#define MAX_STRIDE 64 +// BUF_SIZE is bigger than necessary in order to test strides > block width. +#define BUF_SIZE ((MAX_BLOCK_SIZE - 1) * MAX_STRIDE + MAX_BLOCK_SIZE) +// Due to qpel interpolation the input needs to have one more line than +// the output and the last line needs one more element. 
+// The input is not subject to alignment requirements; making the input buffer +// bigger (by MAX_BLOCK_SIZE - 1) allows us to use a random misalignment. +#define INPUT_BUF_SIZE (MAX_BLOCK_SIZE * MAX_STRIDE + MAX_BLOCK_SIZE + 1 + (MAX_BLOCK_SIZE - 1)) + +#define randomize_buffers(buf0, buf1) \ + do { \ + static_assert(sizeof(buf0) == sizeof(buf1), "Incompatible buffers"); \ + static_assert(!(sizeof(buf0) % 4), "Tail handling needed"); \ + static_assert(sizeof(buf0[0]) == 1 && sizeof(buf1[0]) == 1, \ + "Pointer arithmetic needs to be adapted"); \ + for (size_t k = 0; k < sizeof(buf0); k += 4) { \ + uint32_t r = rnd(); \ + AV_WN32A(buf0 + k, r); \ + AV_WN32A(buf1 + k, r); \ + } \ + } while (0) + + +void checkasm_check_qpeldsp(void) +{ + DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, srcbuf0)[INPUT_BUF_SIZE]; + DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, srcbuf1)[INPUT_BUF_SIZE]; + DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, dstbuf0)[BUF_SIZE]; + DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, dstbuf1)[BUF_SIZE]; + QpelDSPContext qdsp; + static const struct { + const char *name; + size_t offset; + } tests[] = { +#define TEST(NAME) { .name = #NAME, .offset = offsetof(QpelDSPContext, NAME) } + TEST(put_qpel_pixels_tab), + TEST(avg_qpel_pixels_tab), + TEST(put_no_rnd_qpel_pixels_tab), + }; + declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + + ff_qpeldsp_init(&qdsp); + + for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) { + qpel_mc_func (*func_tab)[16] = (qpel_mc_func (*)[16])((char*)&qdsp + tests[i].offset); + for (unsigned j = 0; j < 2; ++j) { + const unsigned blocksize = MAX_BLOCK_SIZE >> j; + + for (unsigned dxy = 0; dxy < 16; ++dxy) { + if (check_func(func_tab[j][dxy], "%s[%u][%u]", tests[i].name, j, dxy)) { + // Don't always use output that is 16-aligned. 
+ size_t dst_offset = (rnd() % (MAX_BLOCK_SIZE / blocksize)) * blocksize; + size_t src_offset = rnd() % MAX_BLOCK_SIZE; + ptrdiff_t stride = (rnd() % (MAX_STRIDE / blocksize) + 1) * blocksize; + const uint8_t *src0 = srcbuf0 + src_offset, *src1 = srcbuf1 + src_offset; + uint8_t *dst0 = dstbuf0 + dst_offset, *dst1 = dstbuf1 + dst_offset; + + if (rnd() & 1) { + // Flip stride. + dst1 += (blocksize - 1) * stride; + dst0 += (blocksize - 1) * stride; + // Due to interpolation potentially blocksize + 1 lines are read + // from src, hence blocksize * stride. + src0 += blocksize * stride; + src1 += blocksize * stride; + stride = -stride; + } + + randomize_buffers(srcbuf0, srcbuf1); + randomize_buffers(dstbuf0, dstbuf1); + call_ref(dst0, src0, stride); + call_new(dst1, src1, stride); + if (memcmp(srcbuf0, srcbuf1, sizeof(srcbuf0)) || memcmp(dstbuf0, dstbuf1, sizeof(dstbuf0))) + fail(); + bench_new(dst0, src0, stride); + } + } + } + } +} diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index 7570c89ad9..178b630fba 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -40,6 +40,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \ fate-checkasm-mpegvideoencdsp \ fate-checkasm-opusdsp \ fate-checkasm-pixblockdsp \ + fate-checkasm-qpeldsp \ fate-checkasm-sbrdsp \ fate-checkasm-rv34dsp \ fate-checkasm-rv40dsp \ -- 2.49.1 >From 73cd6c34f53aadaa9846056315e2a32be028fab8 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 28 Sep 2025 23:23:20 +0200 Subject: [PATCH 02/26] tests/checkasm/hpeldsp: Use instruction-set independent height Otherwise the benchmark numbers are incomparable. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- tests/checkasm/hpeldsp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/checkasm/hpeldsp.c b/tests/checkasm/hpeldsp.c index ba290b3ab8..6c80029507 100644 --- a/tests/checkasm/hpeldsp.c +++ b/tests/checkasm/hpeldsp.c @@ -86,10 +86,16 @@ void checkasm_check_hpeldsp(void) size_t dst_offset = (rnd() % (MAX_BLOCK_SIZE / blocksize)) * blocksize; size_t src_offset = rnd() % MAX_BLOCK_SIZE; ptrdiff_t stride = (rnd() % (MAX_STRIDE / blocksize) + 1) * blocksize; - int h = (rnd() % (MAX_HEIGHT / h_mult) + 1) * h_mult; const uint8_t *src0 = srcbuf0 + src_offset, *src1 = srcbuf1 + src_offset; uint8_t *dst0 = dstbuf0 + dst_offset, *dst1 = dstbuf1 + dst_offset; + // Always use the same height for each test, so that comparisons of benchmarks + // from different instruction sets are meaningful. + static int saved_heights[FF_ARRAY_ELEMS(tests)][4][4]; + int h = saved_heights[i][j][dxy]; + if (!h) + saved_heights[i][j][dxy] = h = (rnd() % (MAX_HEIGHT / h_mult) + 1) * h_mult; + if (rnd() & 1) { // Flip stride. dst1 += (h - 1) * stride; -- 2.49.1 >From 8a89cd25839261cb3c76d1e1f29ee3f518d7889a Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Fri, 26 Sep 2025 19:31:49 +0200 Subject: [PATCH 03/26] avcodec/x86/qpel: Remove always-false branches The ff_avg_pixels{4,8,16}_l2_mmxext() functions are only ever used in the last step (the one that actually writes to the dst buffer) where the number of lines to process is always equal to the dimensions of the block, whereas ff_put_pixels{8,16}_l2_mmxext() are also used in intermediate calculations where the number of lines can be 9 or 17.
The code in qpel.asm uses common macros for both and processes more than one line per loop iteration; it therefore checks for whether the number of lines is odd and treats this line separately; yet this special handling is only needed for the put functions, not the avg functions. It has therefore been %if'ed away for these. The check is also not needed for ff_put_pixels4_l2_mmxext() which is only used by H.264 which always processes four lines. Because ff_{avg,put}_pixels4_l2_mmxext() processes four lines in a single loop iteration, not only the odd-height handling, but the whole loop could be removed. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/qpel.asm | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm index 481251314a..241ed27b8b 100644 --- a/libavcodec/x86/qpel.asm +++ b/libavcodec/x86/qpel.asm @@ -51,17 +51,6 @@ SECTION .text cglobal %1_pixels4_l2, 6,6 movsxdifnidn r3, r3d movsxdifnidn r4, r4d - test r5d, 1 - je .loop - movd m0, [r1] - movd m1, [r2] - add r1, r4 - add r2, 4 - pavgb m0, m1 - OP m0, [r0], m3 - add r0, r3 - dec r5d -.loop: mova m0, [r1] mova m1, [r1+r4] lea r1, [r1+2*r4] @@ -72,15 +61,10 @@ cglobal %1_pixels4_l2, 6,6 lea r0, [r0+2*r3] mova m0, [r1] mova m1, [r1+r4] - lea r1, [r1+2*r4] pavgb m0, [r2+8] pavgb m1, [r2+12] OP m0, [r0], m3 OP m1, [r0+r3], m3 - lea r0, [r0+2*r3] - add r2, 16 - sub r5d, 4 - jne .loop RET %endmacro @@ -95,6 +79,7 @@ PIXELS4_L2 avg cglobal %1_pixels8_l2, 6,6 movsxdifnidn r3, r3d movsxdifnidn r4, r4d +%ifidn %1, put test r5d, 1 je .loop mova m0, [r1] @@ -105,6 +90,7 @@ cglobal %1_pixels8_l2, 6,6 OP m0, [r0] add r0, r3 dec r5d +%endif .loop: mova m0, [r1] mova m1, [r1+r4] @@ -139,6 +125,7 @@ PIXELS8_L2 avg cglobal %1_pixels16_l2, 6,6 movsxdifnidn r3, r3d movsxdifnidn r4, r4d +%ifidn %1, put test r5d, 1 je .loop mova m0, [r1] @@ -151,6 +138,7 @@ cglobal %1_pixels16_l2, 6,6 OP m1, [r0+8] add r0, r3 
dec r5d +%endif .loop: mova m0, [r1] mova m1, [r1+8] -- 2.49.1 >From 744d531195aeccbf20f17940bffe20a965801c3b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sat, 27 Sep 2025 02:32:43 +0200 Subject: [PATCH 04/26] avcodec/x86/qpel{dsp,dsp_init}: Use ptrdiff_t for stride This is more correct given that qpel_mc_func already uses ptrdiff_t; it also makes it possible to avoid movsxdifnidn. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 12 ++++++------ libavcodec/x86/qpel.asm | 12 +++--------- libavcodec/x86/qpeldsp.asm | 21 ++++---------------- libavcodec/x86/qpeldsp_init.c | 36 +++++++++++++++++------------------ 4 files changed, 31 insertions(+), 50 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 33d993d722..d2a20c3d6c 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -31,17 +31,17 @@ #if HAVE_X86ASM void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int dstStride, int src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int dstStride, int src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int dstStride, int src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int dstStride, int src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int dstStride, int src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int dstStride, int src1Stride, int h); + ptrdiff_t
dstStride, ptrdiff_t src1Stride, int h); #define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext #define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm index 241ed27b8b..16da2dbc3b 100644 --- a/libavcodec/x86/qpel.asm +++ b/libavcodec/x86/qpel.asm @@ -45,12 +45,10 @@ SECTION .text %endmacro ; void ff_put/avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, -; int dstStride, int src1Stride, int h) +; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h) %macro PIXELS4_L2 1 %define OP op_%1h cglobal %1_pixels4_l2, 6,6 - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d mova m0, [r1] mova m1, [r1+r4] lea r1, [r1+2*r4] @@ -73,12 +71,10 @@ PIXELS4_L2 put PIXELS4_L2 avg ; void ff_put/avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, -; int dstStride, int src1Stride, int h) +; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h) %macro PIXELS8_L2 1 %define OP op_%1 cglobal %1_pixels8_l2, 6,6 - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d %ifidn %1, put test r5d, 1 je .loop @@ -119,12 +115,10 @@ PIXELS8_L2 put PIXELS8_L2 avg ; void ff_put/avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, -; int dstStride, int src1Stride, int h) +; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h) %macro PIXELS16_L2 1 %define OP op_%1 cglobal %1_pixels16_l2, 6,6 - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d %ifidn %1, put test r5d, 1 je .loop diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm index 30d26a5acc..7fa7dbd2dc 100644 --- a/libavcodec/x86/qpeldsp.asm +++ b/libavcodec/x86/qpeldsp.asm @@ -23,7 +23,6 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA cextern pb_1 cextern pw_3 cextern pw_15 @@ -33,11 +32,10 @@ cextern pw_20 SECTION .text -; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, 
uint8_t *src2, +; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h) %macro PUT_NO_RND_PIXELS8_L2 0 cglobal put_no_rnd_pixels8_l2, 6,6 - movsxdifnidn r4, r4d - movsxdifnidn r3, r3d pcmpeqb m6, m6 test r5d, 1 je .loop @@ -99,11 +97,10 @@ INIT_MMX mmxext PUT_NO_RND_PIXELS8_L2 -; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, +; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h) %macro PUT_NO_RND_PIXELS16_l2 0 cglobal put_no_rnd_pixels16_l2, 6,6 - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d pcmpeqb m6, m6 test r5d, 1 je .loop @@ -169,8 +166,6 @@ PUT_NO_RND_PIXELS16_l2 %macro MPEG4_QPEL16_H_LOWPASS 1 cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16 - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d pxor m7, m7 .loop: mova m0, [r1] @@ -302,8 +297,6 @@ MPEG4_QPEL16_H_LOWPASS put_no_rnd %macro MPEG4_QPEL8_H_LOWPASS 1 cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8 - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d pxor m7, m7 .loop: mova m0, [r1] @@ -398,9 +391,6 @@ MPEG4_QPEL8_H_LOWPASS put_no_rnd %macro MPEG4_QPEL16_V_LOWPASS 1 cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544 - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - mov r4d, 17 mov r5, rsp pxor m7, m7 @@ -494,9 +484,6 @@ MPEG4_QPEL16_V_LOWPASS put_no_rnd %macro MPEG4_QPEL8_V_LOWPASS 1 cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288 - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - mov r4d, 9 mov r5, rsp pxor m7, m7 diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c index 2448adde88..33e1643669 100644 --- a/libavcodec/x86/qpeldsp_init.c +++ b/libavcodec/x86/qpeldsp_init.c @@ -33,52 +33,52 @@ void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int dstStride, int src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int 
dstStride, int src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int dstStride, int src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int dstStride, int src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int dstStride, int src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - int dstStride, int src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride, int h); + ptrdiff_t dstStride, ptrdiff_t srcStride, int h); void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride, int h); + ptrdiff_t dstStride, ptrdiff_t srcStride, int h); void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride, + ptrdiff_t dstStride, ptrdiff_t srcStride, int h); void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride, int h); + ptrdiff_t dstStride, ptrdiff_t srcStride, int h); void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride, int h); + ptrdiff_t dstStride, ptrdiff_t srcStride, int h); void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride, + ptrdiff_t dstStride, ptrdiff_t srcStride, int h); void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride); + ptrdiff_t dstStride, ptrdiff_t srcStride); void 
ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride); + ptrdiff_t dstStride, ptrdiff_t srcStride); void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride); + ptrdiff_t dstStride, ptrdiff_t srcStride); void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride); + ptrdiff_t dstStride, ptrdiff_t srcStride); void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride); + ptrdiff_t dstStride, ptrdiff_t srcStride); void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, - int dstStride, int srcStride); + ptrdiff_t dstStride, ptrdiff_t srcStride); #if HAVE_X86ASM -- 2.49.1 >From 2329f06cd612a11e2452c907bdf1d21de52f6fab Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sat, 27 Sep 2025 04:33:04 +0200 Subject: [PATCH 05/26] avcodec/x86/qpel{,dsp_init}: Remove constant function parameters ff_avg_pixels{4,8,16}_l2_mmxext() are always called with height equal to their blocksize. And ff_{put,avg}_pixels4_l2_mmxext() are furthermore always called with both strides being equal. So remove these redundant function parameters. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 18 ++++--- libavcodec/x86/qpel.asm | 32 ++++++++---- libavcodec/x86/qpeldsp_init.c | 96 ++++++++++++++++++----------------- 3 files changed, 83 insertions(+), 63 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index d2a20c3d6c..43e68d2d97 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -31,21 +31,27 @@ #if HAVE_X86ASM void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); + ptrdiff_t stride); void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); + ptrdiff_t stride); void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride); void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride); +#define ff_put_pixels4_l2_mmxext(dst, src1, src2, dststride, src1stride, h) \ + ff_put_pixels4_l2_mmxext((dst), (src1), (src2), (dststride)) +#define ff_avg_pixels4_l2_mmxext(dst, src1, src2, dststride, src1stride, h) \ + ff_avg_pixels4_l2_mmxext((dst), (src1), (src2), (dststride)) #define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext -#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext +#define ff_avg_pixels8_l2_sse2(dst, src1, src2, dststride, src1stride, h) \ + ff_avg_pixels8_l2_mmxext((dst), (src1), (src2), (dststride), (src1stride)) 
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext -#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext +#define ff_avg_pixels16_l2_sse2(dst, src1, src2, dststride, src1stride, h) \ + ff_avg_pixels16_l2_mmxext((dst), (src1), (src2), (dststride), (src1stride)) #define ff_put_pixels4_mmxext(...) #define DEF_QPEL(OPNAME)\ diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm index 16da2dbc3b..043f7b0a66 100644 --- a/libavcodec/x86/qpel.asm +++ b/libavcodec/x86/qpel.asm @@ -45,20 +45,20 @@ SECTION .text %endmacro ; void ff_put/avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, -; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h) +; ptrdiff_t stride) %macro PIXELS4_L2 1 %define OP op_%1h -cglobal %1_pixels4_l2, 6,6 +cglobal %1_pixels4_l2, 4,4 mova m0, [r1] - mova m1, [r1+r4] - lea r1, [r1+2*r4] + mova m1, [r1+r3] + lea r1, [r1+2*r3] pavgb m0, [r2] pavgb m1, [r2+4] OP m0, [r0], m3 OP m1, [r0+r3], m3 lea r0, [r0+2*r3] mova m0, [r1] - mova m1, [r1+r4] + mova m1, [r1+r3] pavgb m0, [r2+8] pavgb m1, [r2+12] OP m0, [r0], m3 @@ -70,12 +70,12 @@ INIT_MMX mmxext PIXELS4_L2 put PIXELS4_L2 avg -; void ff_put/avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, -; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h) %macro PIXELS8_L2 1 %define OP op_%1 -cglobal %1_pixels8_l2, 6,6 %ifidn %1, put +; void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, +; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h) +cglobal put_pixels8_l2, 6,6 test r5d, 1 je .loop mova m0, [r1] @@ -86,6 +86,11 @@ cglobal %1_pixels8_l2, 6,6 OP m0, [r0] add r0, r3 dec r5d +%else +; void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, +; ptrdiff_t dstStride, ptrdiff_t src1Stride) +cglobal avg_pixels8_l2, 5,6 + mov r5d, 8 %endif .loop: mova m0, [r1] @@ -114,12 +119,12 @@ INIT_MMX mmxext PIXELS8_L2 put PIXELS8_L2 avg -; void ff_put/avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, -; ptrdiff_t dstStride, ptrdiff_t 
src1Stride, int h) %macro PIXELS16_L2 1 %define OP op_%1 -cglobal %1_pixels16_l2, 6,6 %ifidn %1, put +; void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, +; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h) +cglobal put_pixels16_l2, 6,6 test r5d, 1 je .loop mova m0, [r1] @@ -132,6 +137,11 @@ cglobal %1_pixels16_l2, 6,6 OP m1, [r0+8] add r0, r3 dec r5d +%else +; void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, +; ptrdiff_t dstStride, ptrdiff_t src1Stride) +cglobal avg_pixels16_l2, 5,6 + mov r5d, 16 %endif .loop: mova m0, [r1] diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c index 33e1643669..0bb39402d4 100644 --- a/libavcodec/x86/qpeldsp_init.c +++ b/libavcodec/x86/qpeldsp_init.c @@ -39,13 +39,13 @@ void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride); void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); + ptrdiff_t dstStride, ptrdiff_t src1Stride); void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src1Stride, int h); @@ -82,7 +82,7 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, #if HAVE_X86ASM -#define QPEL_OP(OPNAME, RND, MMX) \ +#define QPEL_OP(OPNAME, RND, MMX, ARG) \ static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, \ const uint8_t *src, \ ptrdiff_t stride) \ @@ -91,8 +91,8 @@ static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, \ uint8_t *const half = (uint8_t *) temp; \ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ 
stride, 8); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \ - stride, stride, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src, half, \ + stride, stride, 8)); \ } \ \ static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, \ @@ -111,8 +111,8 @@ static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, \ uint8_t *const half = (uint8_t *) temp; \ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ stride, 8); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \ - stride, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src + 1, half, \ + stride, stride, 8)); \ } \ \ static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, \ @@ -123,8 +123,8 @@ static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, \ uint8_t *const half = (uint8_t *) temp; \ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \ 8, stride); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \ - stride, stride, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src, half, \ + stride, stride, 8)); \ } \ \ static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, \ @@ -143,8 +143,8 @@ static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, \ uint8_t *const half = (uint8_t *) temp; \ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \ 8, stride); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\ - stride, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src + stride, half, \ + stride, stride, 8)); \ } \ \ static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, \ @@ -159,8 +159,8 @@ static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, \ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \ stride, 9); \ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 8, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH, halfHV, \ + stride, 8, 8)); \ } \ \ static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, \ @@ -175,8 +175,8 @@ 
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, \ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ stride, 9); \ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 8, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH, halfHV, \ + stride, 8, 8)); \ } \ \ static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, \ @@ -191,8 +191,8 @@ static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, \ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \ stride, 9); \ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ - stride, 8, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH + 8, halfHV, \ + stride, 8, 8)); \ } \ \ static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, \ @@ -207,8 +207,8 @@ static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, \ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ stride, 9); \ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ - stride, 8, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH + 8, halfHV, \ + stride, 8, 8)); \ } \ \ static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, \ @@ -221,8 +221,8 @@ static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, \ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ stride, 9); \ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 8, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH, halfHV, \ + stride, 8, 8)); \ } \ \ static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, \ @@ -235,8 +235,8 @@ static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, \ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ stride, 9); \ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## 
MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ - stride, 8, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH + 8, halfHV, \ + stride, 8, 8)); \ } \ \ static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, \ @@ -287,8 +287,8 @@ static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, \ uint8_t *const half = (uint8_t *) temp; \ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ stride, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \ - stride, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src, half, \ + stride, stride, 16)); \ } \ \ static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, \ @@ -307,8 +307,8 @@ static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, \ uint8_t *const half = (uint8_t*) temp; \ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ stride, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \ - stride, stride, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src + 1, half, \ + stride, stride, 16)); \ } \ \ static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, \ @@ -319,8 +319,8 @@ static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, \ uint8_t *const half = (uint8_t *) temp; \ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ stride); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \ - stride, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src, half, \ + stride, stride, 16)); \ } \ \ static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, \ @@ -339,8 +339,8 @@ static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, \ uint8_t *const half = (uint8_t *) temp; \ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ stride); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \ - stride, stride, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src+stride, half, \ + stride, stride, 16)); \ } \ \ static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t 
*dst, \ @@ -356,8 +356,8 @@ static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, \ stride, 17); \ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH, halfHV, \ + stride, 16, 16)); \ } \ \ static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, \ @@ -373,8 +373,8 @@ static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, \ stride, 17); \ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH, halfHV, \ + stride, 16, 16)); \ } \ \ static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, \ @@ -390,8 +390,8 @@ static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, \ stride, 17); \ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ - stride, 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH + 16, halfHV, \ + stride, 16, 16)); \ } \ \ static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, \ @@ -407,8 +407,8 @@ static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, \ stride, 17); \ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ - stride, 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH + 16, halfHV, \ + stride, 16, 16)); \ } \ \ static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, \ @@ -422,8 +422,8 @@ static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, \ stride, 17); \ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH, halfHV, \ + stride, 16, 16)); \ } \ \ static void OPNAME 
## qpel16_mc23_ ## MMX(uint8_t *dst, \ @@ -437,8 +437,8 @@ static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, \ stride, 17); \ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ - stride, 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH + 16, halfHV, \ + stride, 16, 16)); \ } \ \ static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, \ @@ -481,9 +481,13 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, \ stride, 16); \ } -QPEL_OP(put_, _, mmxext) -QPEL_OP(avg_, _, mmxext) -QPEL_OP(put_no_rnd_, _no_rnd_, mmxext) +#define PASSTHROUGH(...) __VA_ARGS__ +#define STRIP_HEIGHT(dst, src1, src2, dststride, srcstride, height) \ + (dst), (src1), (src2), (dststride), (srcstride) + +QPEL_OP(put_, _, mmxext, PASSTHROUGH) +QPEL_OP(avg_, _, mmxext, STRIP_HEIGHT) +QPEL_OP(put_no_rnd_, _no_rnd_, mmxext, PASSTHROUGH) #define MC00(OPNAME, SIZE, EXT) \ static void OPNAME ## _qpel ## SIZE ## _mc00_ ## EXT(uint8_t *dst, \ -- 2.49.1 >From c8e51209822c268e05b0b90b79712a727b08b28f Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sat, 27 Sep 2025 05:03:34 +0200 Subject: [PATCH 06/26] avcodec/x86/qpel: Move ff_{put,avg}_pixels4_l2_mmxext to h264_qpel Only used there. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel_8bit.asm | 26 ++++++++++++++++++++++ libavcodec/x86/qpel.asm | 36 ------------------------------- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index 89e7c282b2..6a134ee5b4 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -53,6 +53,32 @@ SECTION .text mova %2, %1 %endmacro +; void ff_put/avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, +; ptrdiff_t stride) +%macro PIXELS4_L2 1 +%define OP op_%1h +cglobal %1_pixels4_l2, 4,4 + mova m0, [r1] + mova m1, [r1+r3] + lea r1, [r1+2*r3] + pavgb m0, [r2] + pavgb m1, [r2+4] + OP m0, [r0], m3 + OP m1, [r0+r3], m3 + lea r0, [r0+2*r3] + mova m0, [r1] + mova m1, [r1+r3] + pavgb m0, [r2+8] + pavgb m1, [r2+12] + OP m0, [r0], m3 + OP m1, [r0+r3], m3 + RET +%endmacro + +INIT_MMX mmxext +PIXELS4_L2 put +PIXELS4_L2 avg + %macro QPEL4_H_LOWPASS_OP 1 cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride movsxdifnidn r2, r2d diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm index 043f7b0a66..93f0d007c3 100644 --- a/libavcodec/x86/qpel.asm +++ b/libavcodec/x86/qpel.asm @@ -25,51 +25,15 @@ SECTION .text -%macro op_avgh 3 - movh %3, %2 - pavgb %1, %3 - movh %2, %1 -%endmacro - %macro op_avg 2 pavgb %1, %2 mova %2, %1 %endmacro -%macro op_puth 2-3 - movh %2, %1 -%endmacro - %macro op_put 2 mova %2, %1 %endmacro -; void ff_put/avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, -; ptrdiff_t stride) -%macro PIXELS4_L2 1 -%define OP op_%1h -cglobal %1_pixels4_l2, 4,4 - mova m0, [r1] - mova m1, [r1+r3] - lea r1, [r1+2*r3] - pavgb m0, [r2] - pavgb m1, [r2+4] - OP m0, [r0], m3 - OP m1, [r0+r3], m3 - lea r0, [r0+2*r3] - mova m0, [r1] - mova m1, [r1+r3] - pavgb m0, [r2+8] - pavgb m1, [r2+12] - OP m0, [r0], m3 - OP m1, [r0+r3], m3 - RET -%endmacro - -INIT_MMX 
mmxext -PIXELS4_L2 put -PIXELS4_L2 avg - %macro PIXELS8_L2 1 %define OP op_%1 %ifidn %1, put -- 2.49.1 >From 0d681e423d83b376727b802ba51a7a38f998e149 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Fri, 26 Sep 2025 20:48:08 +0200 Subject: [PATCH 07/26] configure: Remove mss2->qpeldsp dependency Forgotten in 9cc38cc636badb675b2757fc665d9b8d2f28870f. (mss2 still has an implicit dependency on qpeldsp via the VC-1 decoder.) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index aaa72c9f93..b6e0c9b042 100755 --- a/configure +++ b/configure @@ -3121,7 +3121,7 @@ msmpeg4v2_decoder_select="msmpeg4dec" msmpeg4v2_encoder_select="msmpeg4enc" msmpeg4v3_decoder_select="msmpeg4dec" msmpeg4v3_encoder_select="msmpeg4enc" -mss2_decoder_select="mpegvideodec qpeldsp vc1_decoder" +mss2_decoder_select="mpegvideodec vc1_decoder" mts2_decoder_select="jpegtables mss34dsp" mv30_decoder_select="aandcttables blockdsp" mvha_decoder_select="inflate_wrapper llviddsp" -- 2.49.1 >From 75f825a0fd8357db24a5003d1624193dbe45c481 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Fri, 26 Sep 2025 21:13:57 +0200 Subject: [PATCH 08/26] configure: Avoid mpeg4video_parser->{h263,qpel}dsp dependency This can be easily achieved by moving code only used by the MPEG-4 decoder behind #if CONFIG_MPEG4_DECODER. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- configure | 2 +- libavcodec/Makefile | 2 +- libavcodec/mpeg4videodec.c | 286 ++++++++++++++++++------------------- 3 files changed, 145 insertions(+), 145 deletions(-) diff --git a/configure b/configure index b6e0c9b042..7cc003eb4c 100755 --- a/configure +++ b/configure @@ -3573,7 +3573,7 @@ ftr_parser_select="adts_header mpeg4audio" h264_parser_select="golomb h264dsp h264parse h264_sei" hevc_parser_select="hevcparse hevc_sei" mpegaudio_parser_select="mpegaudioheader" -mpeg4video_parser_select="h263dsp mpegvideodec qpeldsp" +mpeg4video_parser_select="mpegvideodec" vc1_parser_select="vc1dsp" vvc_parser_select="cbs_h266" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index e7fde87b22..d55e899c14 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -1269,7 +1269,7 @@ OBJS-$(CONFIG_MJPEG_PARSER) += mjpeg_parser.o OBJS-$(CONFIG_MLP_PARSER) += mlp_parse.o mlp_parser.o mlp.o OBJS-$(CONFIG_MPEG4VIDEO_PARSER) += mpeg4video_parser.o h263.o \ mpeg4videodec.o mpeg4video.o \ - ituh263dec.o h263dec.o h263data.o + ituh263dec.o h263data.o OBJS-$(CONFIG_MPEGAUDIO_PARSER) += mpegaudio_parser.o OBJS-$(CONFIG_MPEGVIDEO_PARSER) += mpegvideo_parser.o \ mpeg12.o mpeg12data.o diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c index f3d138387b..4a1385ea4d 100644 --- a/libavcodec/mpeg4videodec.c +++ b/libavcodec/mpeg4videodec.c @@ -3065,148 +3065,6 @@ static int decode_user_data(Mpeg4DecContext *ctx, GetBitContext *gb) return 0; } -static av_cold void permute_quant_matrix(uint16_t matrix[64], - const uint8_t new_perm[64], - const uint8_t old_perm[64]) -{ - uint16_t tmp[64]; - - memcpy(tmp, matrix, sizeof(tmp)); - for (int i = 0; i < 64; ++i) - matrix[new_perm[i]] = tmp[old_perm[i]]; -} - -static av_cold void switch_to_xvid_idct(AVCodecContext *const avctx, - MpegEncContext *const s) -{ - uint8_t old_permutation[64]; - - memcpy(old_permutation, s->idsp.idct_permutation, 
sizeof(old_permutation)); - - avctx->idct_algo = FF_IDCT_XVID; - ff_mpv_idct_init(s); - ff_permute_scantable(s->permutated_intra_h_scantable, - s->alternate_scan ? ff_alternate_vertical_scan : ff_alternate_horizontal_scan, - s->idsp.idct_permutation); - - // Normal (i.e. non-studio) MPEG-4 does not use the chroma matrices. - permute_quant_matrix(s->inter_matrix, s->idsp.idct_permutation, old_permutation); - permute_quant_matrix(s->intra_matrix, s->idsp.idct_permutation, old_permutation); -} - -void ff_mpeg4_workaround_bugs(AVCodecContext *avctx) -{ - Mpeg4DecContext *ctx = avctx->priv_data; - H263DecContext *const h = &ctx->h; - - if (ctx->xvid_build == -1 && ctx->divx_version == -1 && ctx->lavc_build == -1) { - if (h->c.codec_tag == AV_RL32("XVID") || - h->c.codec_tag == AV_RL32("XVIX") || - h->c.codec_tag == AV_RL32("RMP4") || - h->c.codec_tag == AV_RL32("ZMP4") || - h->c.codec_tag == AV_RL32("SIPP")) - ctx->xvid_build = 0; - } - - if (ctx->xvid_build == -1 && ctx->divx_version == -1 && ctx->lavc_build == -1) - if (h->c.codec_tag == AV_RL32("DIVX") && ctx->vo_type == 0 && - ctx->vol_control_parameters == 0) - ctx->divx_version = 400; // divx 4 - - if (ctx->xvid_build >= 0 && ctx->divx_version >= 0) { - ctx->divx_version = - ctx->divx_build = -1; - } - - if (h->c.workaround_bugs & FF_BUG_AUTODETECT) { - if (h->c.codec_tag == AV_RL32("XVIX")) - h->c.workaround_bugs |= FF_BUG_XVID_ILACE; - - if (h->c.codec_tag == AV_RL32("UMP4")) - h->c.workaround_bugs |= FF_BUG_UMP4; - - if (ctx->divx_version >= 500 && ctx->divx_build < 1814) - h->c.workaround_bugs |= FF_BUG_QPEL_CHROMA; - - if (ctx->divx_version > 502 && ctx->divx_build < 1814) - h->c.workaround_bugs |= FF_BUG_QPEL_CHROMA2; - - if (ctx->xvid_build <= 3U) - h->padding_bug_score = 256 * 256 * 256 * 64; - - if (ctx->xvid_build <= 1U) - h->c.workaround_bugs |= FF_BUG_QPEL_CHROMA; - - if (ctx->xvid_build <= 12U) - h->c.workaround_bugs |= FF_BUG_EDGE; - - if (ctx->xvid_build <= 32U) - h->c.workaround_bugs |= 
FF_BUG_DC_CLIP; - -#define SET_QPEL_FUNC(postfix1, postfix2) \ - h->c.qdsp.put_ ## postfix1 = ff_put_ ## postfix2; \ - h->c.qdsp.put_no_rnd_ ## postfix1 = ff_put_no_rnd_ ## postfix2; \ - h->c.qdsp.avg_ ## postfix1 = ff_avg_ ## postfix2; - - if (ctx->lavc_build < 4653U) - h->c.workaround_bugs |= FF_BUG_STD_QPEL; - - if (ctx->lavc_build < 4655U) - h->c.workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE; - - if (ctx->lavc_build < 4670U) - h->c.workaround_bugs |= FF_BUG_EDGE; - - if (ctx->lavc_build <= 4712U) - h->c.workaround_bugs |= FF_BUG_DC_CLIP; - - if ((ctx->lavc_build&0xFF) >= 100) { - if (ctx->lavc_build > 3621476 && ctx->lavc_build < 3752552 && - (ctx->lavc_build < 3752037 || ctx->lavc_build > 3752191) // 3.2.1+ - ) - h->c.workaround_bugs |= FF_BUG_IEDGE; - } - - if (ctx->divx_version >= 0) - h->c.workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE; - if (ctx->divx_version == 501 && ctx->divx_build == 20020416) - h->padding_bug_score = 256 * 256 * 256 * 64; - - if (ctx->divx_version < 500U) - h->c.workaround_bugs |= FF_BUG_EDGE; - - if (ctx->divx_version >= 0) - h->c.workaround_bugs |= FF_BUG_HPEL_CHROMA; - } - - if (h->c.workaround_bugs & FF_BUG_STD_QPEL) { - SET_QPEL_FUNC(qpel_pixels_tab[0][5], qpel16_mc11_old_c) - SET_QPEL_FUNC(qpel_pixels_tab[0][7], qpel16_mc31_old_c) - SET_QPEL_FUNC(qpel_pixels_tab[0][9], qpel16_mc12_old_c) - SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_old_c) - SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_old_c) - SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_old_c) - - SET_QPEL_FUNC(qpel_pixels_tab[1][5], qpel8_mc11_old_c) - SET_QPEL_FUNC(qpel_pixels_tab[1][7], qpel8_mc31_old_c) - SET_QPEL_FUNC(qpel_pixels_tab[1][9], qpel8_mc12_old_c) - SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_old_c) - SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_old_c) - SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_old_c) - } - - if (avctx->debug & FF_DEBUG_BUGS) - av_log(h->c.avctx, AV_LOG_DEBUG, - "bugs: %X lavc_build:%d xvid_build:%d divx_version:%d 
divx_build:%d %s\n", - h->c.workaround_bugs, ctx->lavc_build, ctx->xvid_build, - ctx->divx_version, ctx->divx_build, h->divx_packed ? "p" : ""); - - if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && - avctx->idct_algo == FF_IDCT_AUTO && !h->c.studio_profile) { - switch_to_xvid_idct(avctx, &h->c); - } -} - static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb, int parse_only) { @@ -3738,6 +3596,149 @@ end: return decode_vop_header(ctx, gb, parse_only); } +#if CONFIG_MPEG4_DECODER +static av_cold void permute_quant_matrix(uint16_t matrix[64], + const uint8_t new_perm[64], + const uint8_t old_perm[64]) +{ + uint16_t tmp[64]; + + memcpy(tmp, matrix, sizeof(tmp)); + for (int i = 0; i < 64; ++i) + matrix[new_perm[i]] = tmp[old_perm[i]]; +} + +static av_cold void switch_to_xvid_idct(AVCodecContext *const avctx, + MpegEncContext *const s) +{ + uint8_t old_permutation[64]; + + memcpy(old_permutation, s->idsp.idct_permutation, sizeof(old_permutation)); + + avctx->idct_algo = FF_IDCT_XVID; + ff_mpv_idct_init(s); + ff_permute_scantable(s->permutated_intra_h_scantable, + s->alternate_scan ? ff_alternate_vertical_scan : ff_alternate_horizontal_scan, + s->idsp.idct_permutation); + + // Normal (i.e. non-studio) MPEG-4 does not use the chroma matrices. 
+ permute_quant_matrix(s->inter_matrix, s->idsp.idct_permutation, old_permutation); + permute_quant_matrix(s->intra_matrix, s->idsp.idct_permutation, old_permutation); +} + +void ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +{ + Mpeg4DecContext *ctx = avctx->priv_data; + H263DecContext *const h = &ctx->h; + + if (ctx->xvid_build == -1 && ctx->divx_version == -1 && ctx->lavc_build == -1) { + if (h->c.codec_tag == AV_RL32("XVID") || + h->c.codec_tag == AV_RL32("XVIX") || + h->c.codec_tag == AV_RL32("RMP4") || + h->c.codec_tag == AV_RL32("ZMP4") || + h->c.codec_tag == AV_RL32("SIPP")) + ctx->xvid_build = 0; + } + + if (ctx->xvid_build == -1 && ctx->divx_version == -1 && ctx->lavc_build == -1) + if (h->c.codec_tag == AV_RL32("DIVX") && ctx->vo_type == 0 && + ctx->vol_control_parameters == 0) + ctx->divx_version = 400; // divx 4 + + if (ctx->xvid_build >= 0 && ctx->divx_version >= 0) { + ctx->divx_version = + ctx->divx_build = -1; + } + + if (h->c.workaround_bugs & FF_BUG_AUTODETECT) { + if (h->c.codec_tag == AV_RL32("XVIX")) + h->c.workaround_bugs |= FF_BUG_XVID_ILACE; + + if (h->c.codec_tag == AV_RL32("UMP4")) + h->c.workaround_bugs |= FF_BUG_UMP4; + + if (ctx->divx_version >= 500 && ctx->divx_build < 1814) + h->c.workaround_bugs |= FF_BUG_QPEL_CHROMA; + + if (ctx->divx_version > 502 && ctx->divx_build < 1814) + h->c.workaround_bugs |= FF_BUG_QPEL_CHROMA2; + + if (ctx->xvid_build <= 3U) + h->padding_bug_score = 256 * 256 * 256 * 64; + + if (ctx->xvid_build <= 1U) + h->c.workaround_bugs |= FF_BUG_QPEL_CHROMA; + + if (ctx->xvid_build <= 12U) + h->c.workaround_bugs |= FF_BUG_EDGE; + + if (ctx->xvid_build <= 32U) + h->c.workaround_bugs |= FF_BUG_DC_CLIP; + +#define SET_QPEL_FUNC(postfix1, postfix2) \ + h->c.qdsp.put_ ## postfix1 = ff_put_ ## postfix2; \ + h->c.qdsp.put_no_rnd_ ## postfix1 = ff_put_no_rnd_ ## postfix2; \ + h->c.qdsp.avg_ ## postfix1 = ff_avg_ ## postfix2; + + if (ctx->lavc_build < 4653U) + h->c.workaround_bugs |= FF_BUG_STD_QPEL; + + if (ctx->lavc_build 
< 4655U) + h->c.workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE; + + if (ctx->lavc_build < 4670U) + h->c.workaround_bugs |= FF_BUG_EDGE; + + if (ctx->lavc_build <= 4712U) + h->c.workaround_bugs |= FF_BUG_DC_CLIP; + + if ((ctx->lavc_build&0xFF) >= 100) { + if (ctx->lavc_build > 3621476 && ctx->lavc_build < 3752552 && + (ctx->lavc_build < 3752037 || ctx->lavc_build > 3752191) // 3.2.1+ + ) + h->c.workaround_bugs |= FF_BUG_IEDGE; + } + + if (ctx->divx_version >= 0) + h->c.workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE; + if (ctx->divx_version == 501 && ctx->divx_build == 20020416) + h->padding_bug_score = 256 * 256 * 256 * 64; + + if (ctx->divx_version < 500U) + h->c.workaround_bugs |= FF_BUG_EDGE; + + if (ctx->divx_version >= 0) + h->c.workaround_bugs |= FF_BUG_HPEL_CHROMA; + } + + if (h->c.workaround_bugs & FF_BUG_STD_QPEL) { + SET_QPEL_FUNC(qpel_pixels_tab[0][5], qpel16_mc11_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[0][7], qpel16_mc31_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[0][9], qpel16_mc12_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_old_c) + + SET_QPEL_FUNC(qpel_pixels_tab[1][5], qpel8_mc11_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[1][7], qpel8_mc31_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[1][9], qpel8_mc12_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_old_c) + } + + if (avctx->debug & FF_DEBUG_BUGS) + av_log(h->c.avctx, AV_LOG_DEBUG, + "bugs: %X lavc_build:%d xvid_build:%d divx_version:%d divx_build:%d %s\n", + h->c.workaround_bugs, ctx->lavc_build, ctx->xvid_build, + ctx->divx_version, ctx->divx_build, h->divx_packed ? 
"p" : ""); + + if (ctx->xvid_build >= 0 && + avctx->idct_algo == FF_IDCT_AUTO && !h->c.studio_profile) { + switch_to_xvid_idct(avctx, &h->c); + } +} + static int mpeg4_decode_picture_header(H263DecContext *const h) { Mpeg4DecContext *const ctx = h263_to_mpeg4(h); @@ -3821,7 +3822,6 @@ int ff_mpeg4_frame_end(AVCodecContext *avctx, const AVPacket *pkt) return 0; } -#if CONFIG_MPEG4_DECODER #if HAVE_THREADS static av_cold void clear_context(MpegEncContext *s) { -- 2.49.1 >From 32e0436b899f311fda348f34dbbc2929e1d8cc66 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Fri, 26 Sep 2025 21:28:56 +0200 Subject: [PATCH 09/26] configure: Remove h263_decoder->h263_parser,qpeldsp dependency The former is unnecessary since 3ceffe783965767e62d59e8e68ecd265c98460ec. The latter is since ff_mpeg4_workaround_bugs() (and thereby setting the "old" qpeldsp functions) has been moved inside #if CONFIG_MPEG4_DECODER. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- configure | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 7cc003eb4c..0128fd5534 100755 --- a/configure +++ b/configure @@ -3053,7 +3053,7 @@ g723_1_encoder_select="celp_math" g729_decoder_select="audiodsp celp_math" h261_decoder_select="mpegvideodec" h261_encoder_select="mpegvideoenc" -h263_decoder_select="h263_parser h263dsp mpegvideodec qpeldsp" +h263_decoder_select="h263dsp mpegvideodec" h263_encoder_select="h263dsp mpegvideoenc" h263i_decoder_select="h263_decoder" h263p_decoder_select="h263_decoder" @@ -3112,7 +3112,7 @@ mpeg1video_decoder_select="mpegvideodec" mpeg1video_encoder_select="mpegvideoenc" mpeg2video_decoder_select="mpegvideodec" mpeg2video_encoder_select="mpegvideoenc" -mpeg4_decoder_select="h263_decoder" +mpeg4_decoder_select="h263_decoder qpeldsp" mpeg4_encoder_select="h263_encoder qpeldsp" msa1_decoder_select="mss34dsp" mscc_decoder_select="inflate_wrapper" -- 2.49.1 >From 
eeef3755fa14e7f81b891de7871d6644d72c6074 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Fri, 26 Sep 2025 22:30:02 +0200 Subject: [PATCH 10/26] avcodec/mips/Makefile: Fix VC1DSP build rules The wrong rules affected standalone builds of the VC-1 parser. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/mips/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index f1c1c796a2..4bbc2f00ea 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -93,7 +93,7 @@ MSA-OBJS-$(CONFIG_IDCTDSP) += mips/idctdsp_msa.o \ MSA-OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_msa.o MSA-OBJS-$(CONFIG_MPEGVIDEOENCDSP) += mips/mpegvideoencdsp_msa.o MSA-OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_msa.o -MSA-OBJS-$(CONFIG_VC1_DECODER) += mips/vc1dsp_msa.o +MSA-OBJS-$(CONFIG_VC1DSP) += mips/vc1dsp_msa.o MMI-OBJS += mips/constants.o MMI-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o @@ -109,7 +109,7 @@ MMI-OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_mmi.o MMI-OBJS-$(CONFIG_H264QPEL) += mips/h264qpel_mmi.o MMI-OBJS-$(CONFIG_VP8_DECODER) += mips/vp8dsp_mmi.o MMI-OBJS-$(CONFIG_HPELDSP) += mips/hpeldsp_mmi.o -MMI-OBJS-$(CONFIG_VC1_DECODER) += mips/vc1dsp_mmi.o +MMI-OBJS-$(CONFIG_VC1DSP) += mips/vc1dsp_mmi.o MMI-OBJS-$(CONFIG_WMV2DSP) += mips/wmv2dsp_mmi.o MMI-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_mmi.o MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o -- 2.49.1 >From 4b6f6b7a3670fb37accd828d251efb3c4bc873ef Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Fri, 26 Sep 2025 22:35:49 +0200 Subject: [PATCH 11/26] configure: Remove vc1dsp->qpeldsp dependency vc1dsp only needs qpeldsp for some x86 fpel functions; instead add a direct dependency on x86/fpel.o.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- configure | 2 +- libavcodec/x86/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 0128fd5534..6e55f6daec 100755 --- a/configure +++ b/configure @@ -2955,7 +2955,7 @@ mpegvideodec_select="h264chroma mpegvideo mpeg_er" mpegvideoenc_select="aandcttables fdctdsp me_cmp mpegvideo mpegvideoencdsp pixblockdsp" msmpeg4dec_select="h263_decoder" msmpeg4enc_select="h263_encoder" -vc1dsp_select="h264chroma qpeldsp startcode" +vc1dsp_select="h264chroma startcode" vvc_sei_select="atsc_a53 golomb" wmv2dsp_select="qpeldsp" diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 461753c2fe..36168b4aff 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -132,7 +132,7 @@ X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \ x86/qpel.o X86ASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \ - x86/vc1dsp_mc.o + x86/vc1dsp_mc.o x86/fpel.o ifdef ARCH_X86_64 X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o else -- 2.49.1 >From e42676536c8bec9c7d3ed846c0fd55f65c20ee9b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sat, 27 Sep 2025 01:32:08 +0200 Subject: [PATCH 12/26] avcodec/h264qpel: Don't build unused 2x2 size funcs for bitdepths > 8 The 2x2 put functions are only used by Snow and Snow uses only the eight bit versions. The rest is dead code. Disabling it saved 41277B here. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/h264qpel.c | 62 ++++++++++++++++++++++++++- libavcodec/h264qpel_template.c | 76 ---------------------------------- tests/checkasm/h264qpel.c | 2 +- 3 files changed, 62 insertions(+), 78 deletions(-) diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c index faca1e8953..be80203c4b 100644 --- a/libavcodec/h264qpel.c +++ b/libavcodec/h264qpel.c @@ -20,11 +20,71 @@ */ #include "libavutil/attributes.h" +#include "libavutil/common.h" #include "h264qpel.h" #define pixeltmp int16_t #define BIT_DEPTH 8 #include "h264qpel_template.c" + +static void put_h264_qpel2_h_lowpass_8(uint8_t *dst, const uint8_t *restrict src, int dstStride, int srcStride) +{ + const int h = 2; + for (int i = 0; i < h; ++i) { + dst[0] = av_clip_uint8(((src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]) + 16) >> 5); + dst[1] = av_clip_uint8(((src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]) + 16) >> 5); + dst += dstStride; + src += srcStride; + } +} + +static void put_h264_qpel2_v_lowpass_8(uint8_t *dst, const uint8_t *restrict src, int dstStride, int srcStride) +{ + const int w = 2; + for (int i = 0; i < w; ++i) { + const int srcB = src[-2*srcStride]; + const int srcA = src[-1*srcStride]; + const int src0 = src[0 *srcStride]; + const int src1 = src[1 *srcStride]; + const int src2 = src[2 *srcStride]; + const int src3 = src[3 *srcStride]; + const int src4 = src[4 *srcStride]; + dst[0*dstStride] = av_clip_uint8(((src0+src1)*20 - (srcA+src2)*5 + (srcB+src3) + 16) >> 5); + dst[1*dstStride] = av_clip_uint8(((src1+src2)*20 - (src0+src3)*5 + (srcA+src4) + 16) >> 5); + dst++; + src++; + } +} + +static void put_h264_qpel2_hv_lowpass_8(uint8_t *dst, pixeltmp *tmp, const uint8_t *restrict src, int dstStride, int tmpStride, int srcStride) +{ + const int h = 2; + const int w = 2; + src -= 2*srcStride; + for (int i = 0; i < h + 5; ++i) { + tmp[0] = (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]); + 
tmp[1] = (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]); + tmp += tmpStride; + src += srcStride; + } + tmp -= tmpStride*(h+5-2); + for (int i = 0; i < w; ++i) { + const int tmpB = tmp[-2*tmpStride]; + const int tmpA = tmp[-1*tmpStride]; + const int tmp0 = tmp[0 *tmpStride]; + const int tmp1 = tmp[1 *tmpStride]; + const int tmp2 = tmp[2 *tmpStride]; + const int tmp3 = tmp[3 *tmpStride]; + const int tmp4 = tmp[4 *tmpStride]; + dst[0*dstStride] = av_clip_uint8(((tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3) + 512) >> 10); + dst[1*dstStride] = av_clip_uint8(((tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4) + 512) >> 10); + dst++; + tmp++; + } +} + +H264_MC(put_, 2) + #undef BIT_DEPTH #define BIT_DEPTH 9 @@ -73,7 +133,6 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth) dspfunc2(put_h264_qpel, 0, 16, depth); \ dspfunc2(put_h264_qpel, 1, 8, depth); \ dspfunc2(put_h264_qpel, 2, 4, depth); \ - dspfunc2(put_h264_qpel, 3, 2, depth); \ dspfunc2(avg_h264_qpel, 0, 16, depth); \ dspfunc2(avg_h264_qpel, 1, 8, depth); \ dspfunc2(avg_h264_qpel, 2, 4, depth) @@ -81,6 +140,7 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth) switch (bit_depth) { default: SET_QPEL(8); + dspfunc2(put_h264_qpel, 3, 2, 8); // only used by Snow break; case 9: SET_QPEL(9); diff --git a/libavcodec/h264qpel_template.c b/libavcodec/h264qpel_template.c index b71710e6db..a55b45e824 100644 --- a/libavcodec/h264qpel_template.c +++ b/libavcodec/h264qpel_template.c @@ -75,81 +75,6 @@ static inline void FUNC(copy_block16)(uint8_t *dst, const uint8_t *restrict src, } #define H264_LOWPASS(OPNAME, OP, OP2) \ -av_unused static void FUNC(OPNAME ## h264_qpel2_h_lowpass)(uint8_t *p_dst, const uint8_t *restrict p_src, int dstStride, int srcStride)\ -{\ - const int h=2;\ - int i;\ - pixel *dst = (pixel*)p_dst;\ - const pixel *restrict src = (const pixel*)p_src;\ - dstStride >>= sizeof(pixel)-1;\ - srcStride >>= sizeof(pixel)-1;\ - for(i=0; i<h; i++)\ - {\ - OP(dst[0], 
(src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ - OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ - dst+=dstStride;\ - src+=srcStride;\ - }\ -}\ -\ -av_unused static void FUNC(OPNAME ## h264_qpel2_v_lowpass)(uint8_t *_dst, const uint8_t *restrict _src, int dstStride, int srcStride)\ -{\ - const int w=2;\ - int i;\ - pixel *dst = (pixel*)_dst;\ - const pixel *restrict src = (const pixel*)_src;\ - dstStride >>= sizeof(pixel)-1;\ - srcStride >>= sizeof(pixel)-1;\ - for(i=0; i<w; i++)\ - {\ - const int srcB= src[-2*srcStride];\ - const int srcA= src[-1*srcStride];\ - const int src0= src[0 *srcStride];\ - const int src1= src[1 *srcStride];\ - const int src2= src[2 *srcStride];\ - const int src3= src[3 *srcStride];\ - const int src4= src[4 *srcStride];\ - OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ - OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ - dst++;\ - src++;\ - }\ -}\ -\ -av_unused static void FUNC(OPNAME ## h264_qpel2_hv_lowpass)(uint8_t *_dst, pixeltmp *tmp, const uint8_t *restrict _src, int dstStride, int tmpStride, int srcStride)\ -{\ - const int h=2;\ - const int w=2;\ - const int pad = (BIT_DEPTH == 10) ? 
(-10 * ((1<<BIT_DEPTH)-1)) : 0;\ - int i;\ - pixel *dst = (pixel*)_dst;\ - const pixel *restrict src = (const pixel*)_src;\ - dstStride >>= sizeof(pixel)-1;\ - srcStride >>= sizeof(pixel)-1;\ - src -= 2*srcStride;\ - for(i=0; i<h+5; i++)\ - {\ - tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]) + pad;\ - tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]) + pad;\ - tmp+=tmpStride;\ - src+=srcStride;\ - }\ - tmp -= tmpStride*(h+5-2);\ - for(i=0; i<w; i++)\ - {\ - const int tmpB= tmp[-2*tmpStride] - pad;\ - const int tmpA= tmp[-1*tmpStride] - pad;\ - const int tmp0= tmp[0 *tmpStride] - pad;\ - const int tmp1= tmp[1 *tmpStride] - pad;\ - const int tmp2= tmp[2 *tmpStride] - pad;\ - const int tmp3= tmp[3 *tmpStride] - pad;\ - const int tmp4= tmp[4 *tmpStride] - pad;\ - OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ - OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ - dst++;\ - tmp++;\ - }\ -}\ static void FUNC(OPNAME ## h264_qpel4_h_lowpass)(uint8_t *_dst, const uint8_t *restrict _src, int dstStride, int srcStride)\ {\ const int h=4;\ @@ -540,7 +465,6 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc32)(uint8_t *dst, const uint H264_LOWPASS(put_ , op_put, op2_put) H264_LOWPASS(avg_ , op_avg, op2_avg) -H264_MC(put_, 2) H264_MC(put_, 4) H264_MC(put_, 8) H264_MC(put_, 16) diff --git a/tests/checkasm/h264qpel.c b/tests/checkasm/h264qpel.c index e47d659929..7387c2510a 100644 --- a/tests/checkasm/h264qpel.c +++ b/tests/checkasm/h264qpel.c @@ -64,7 +64,7 @@ void checkasm_check_h264qpel(void) for (bit_depth = 8; bit_depth <= 10; bit_depth++) { ff_h264qpel_init(&h, bit_depth); - for (i = 0; i < (op ? 3 : 4); i++) { + for (i = 0; i < (op || bit_depth != 8 ? 
3 : 4); i++) { int size = 16 >> i; for (j = 0; j < 16; j++) if (check_func(tab[i][j], "%s_h264_qpel_%d_mc%d%d_%d", op_name, size, j & 3, j >> 2, bit_depth)) { -- 2.49.1 >From 8f8a5b17abda5c2bbf9714f2f733d0012aa17e51 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sat, 27 Sep 2025 02:15:27 +0200 Subject: [PATCH 13/26] avcodec/h264qpel: Make avg_h264_qpel_pixels_tab smaller avg_h264_qpel only supports 16x16, 8x8 and 4x4 block sizes, so the table is currently unnecessarily large. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/h264qpel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h index 24baf826f9..6ae5ba1724 100644 --- a/libavcodec/h264qpel.h +++ b/libavcodec/h264qpel.h @@ -26,7 +26,7 @@ typedef struct H264QpelContext { qpel_mc_func put_h264_qpel_pixels_tab[4][16]; - qpel_mc_func avg_h264_qpel_pixels_tab[4][16]; + qpel_mc_func avg_h264_qpel_pixels_tab[3][16]; } H264QpelContext; void ff_h264qpel_init(H264QpelContext *c, int bit_depth); -- 2.49.1 >From 635067eebb5bdac1857069bae9ef7452eb02f290 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 28 Sep 2025 23:25:59 +0200 Subject: [PATCH 14/26] avcodec/x86/fpel: Remove redundant repetition The repetition count is always one since 2cf9e733c6a666600423a0967f23341d9f09e3c8.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/fpel.asm | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm index 8551ff1ff3..477caa8b44 100644 --- a/libavcodec/x86/fpel.asm +++ b/libavcodec/x86/fpel.asm @@ -31,33 +31,27 @@ SECTION .text %if %2 == mmsize/2 %define LOAD movh %define SAVE movh -%define LEN mmsize %else %define LOAD movu %define SAVE mova -%define LEN %2 %endif cglobal %1_pixels%2, 4,5,4 lea r4, [r2*3] .loop: -%assign %%i 0 -%rep LEN/mmsize - LOAD m0, [r1 + %%i] - LOAD m1, [r1+r2 + %%i] - LOAD m2, [r1+r2*2 + %%i] - LOAD m3, [r1+r4 + %%i] + LOAD m0, [r1] + LOAD m1, [r1+r2] + LOAD m2, [r1+r2*2] + LOAD m3, [r1+r4] %ifidn %1, avg - pavgb m0, [r0 + %%i] - pavgb m1, [r0+r2 + %%i] - pavgb m2, [r0+r2*2 + %%i] - pavgb m3, [r0+r4 + %%i] + pavgb m0, [r0] + pavgb m1, [r0+r2] + pavgb m2, [r0+r2*2] + pavgb m3, [r0+r4] %endif - SAVE [r0 + %%i], m0 - SAVE [r0+r2 + %%i], m1 - SAVE [r0+r2*2 + %%i], m2 - SAVE [r0+r4 + %%i], m3 -%assign %%i %%i+mmsize -%endrep + SAVE [r0], m0 + SAVE [r0+r2], m1 + SAVE [r0+r2*2], m2 + SAVE [r0+r4], m3 sub r3d, 4 lea r1, [r1+r2*4] lea r0, [r0+r2*4] -- 2.49.1 >From 705bc69615320b181b8c4896da0a0fd6aa569170 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Mon, 29 Sep 2025 00:11:47 +0200 Subject: [PATCH 15/26] avcodec/fpel: Avoid loop in ff_avg_pixels4_mmxext() It is only used by h264_qpel.c and only with height four (which is unrolled) and uses a loop in order to handle multiples of four as height. Remove the loop and the height parameter and move the function to h264_qpel_8bit.asm. This leads to a bit of code duplication, but this is simpler than all the %if checks necessary to achieve the same outcome in fpel.asm. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/fpel.asm | 1 - libavcodec/x86/fpel.h | 2 -- libavcodec/x86/h264_qpel.c | 21 ++++++++++----------- libavcodec/x86/h264_qpel_8bit.asm | 20 ++++++++++++++++++-- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm index 477caa8b44..8ca684efa9 100644 --- a/libavcodec/x86/fpel.asm +++ b/libavcodec/x86/fpel.asm @@ -63,7 +63,6 @@ INIT_MMX mmx OP_PIXELS put, 8 INIT_MMX mmxext -OP_PIXELS avg, 4 OP_PIXELS avg, 8 INIT_XMM sse2 diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h index 851a70b99f..dc69e1cd83 100644 --- a/libavcodec/x86/fpel.h +++ b/libavcodec/x86/fpel.h @@ -22,8 +22,6 @@ #include <stddef.h> #include <stdint.h> -void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 43e68d2d97..649bfabda8 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -30,6 +30,7 @@ #include "fpel.h" #if HAVE_X86ASM +void ff_avg_pixels4_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ptrdiff_t stride); void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, @@ -52,7 +53,6 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext #define ff_avg_pixels16_l2_sse2(dst, src1, src2, dststride, src1stride, h) \ ff_avg_pixels16_l2_mmxext((dst), (src1), (src2), (dststride), (src1stride)) -#define ff_put_pixels4_mmxext(...) 
#define DEF_QPEL(OPNAME)\ void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ @@ -191,8 +191,7 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uin #define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext #define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext -#define H264_MC_C_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \ -H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ +#define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \ H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ @@ -208,11 +207,11 @@ static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src, ff_avg_pixels16_sse2(dst, src, stride, 16); } -#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ -av_unused static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ -{\ - ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ -}\ +static void avg_h264_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + ff_avg_pixels8_mmxext(dst, src, stride, 8); +} #define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ @@ -346,8 +345,7 @@ QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3) QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3) -H264_MC(H264_MC_C_V_H_HV, 4, mmxext, 8) -H264_MC_C(avg_, 8, mmxext, 8) +H264_MC(H264_MC_V_H_HV, 4, mmxext, 8) H264_MC_816(H264_MC_V, sse2) H264_MC_816(H264_MC_HV, sse2) H264_MC_816(H264_MC_H, ssse3) @@ -461,7 +459,8 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) if (!high_bit_depth) { SET_QPEL_FUNCS_1PP(put_h264_qpel, 2, 4, mmxext, ); c->avg_h264_qpel_pixels_tab[1][0] = avg_h264_qpel8_mc00_mmxext; - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, ); + 
SET_QPEL_FUNCS_1PP(avg_h264_qpel, 2, 4, mmxext, ); + c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_pixels4_mmxext; } else if (bit_depth == 10) { SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_); SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_); diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index 6a134ee5b4..07056f1215 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -25,14 +25,30 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA 32 - cextern pw_16 cextern pw_5 cextern pb_0 SECTION .text +; void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, +; ptrdiff_t line_size) +INIT_MMX mmxext +cglobal avg_pixels4, 3,4 + lea r3, [r2*3] + movh m0, [r1] + movh m1, [r1+r2] + movh m2, [r1+r2*2] + movh m3, [r1+r3] + pavgb m0, [r0] + pavgb m1, [r0+r2] + pavgb m2, [r0+r2*2] + pavgb m3, [r0+r3] + movh [r0], m0 + movh [r0+r2], m1 + movh [r0+r2*2], m2 + movh [r0+r3], m3 + RET %macro op_avgh 3 movh %3, %2 -- 2.49.1 >From a3feeca4ac73fc80d722594311cf917dd51e9f4f Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Mon, 29 Sep 2025 02:31:42 +0200 Subject: [PATCH 16/26] avcodec/x86/h264_qpel: Add ff_{avg,put}_pixels16_l2_shift5_sse2 Up until now this function was emulated via two calls to ff_{avg,put}_pixels8_l2_shift5_mmxext(). Adding a dedicated function proved beneficial both size-wise and performance-wise: The new functions take 192B, yet the simplified calls save 256B with GCC and 320B with Clang here. This change will also allow further optimizations.
Old benchmarks: avg_h264_qpel_16_mc12_8_c: 1735.8 ( 1.00x) avg_h264_qpel_16_mc12_8_sse2: 300.8 ( 5.77x) avg_h264_qpel_16_mc12_8_ssse3: 233.3 ( 7.44x) avg_h264_qpel_16_mc32_8_c: 1777.9 ( 1.00x) avg_h264_qpel_16_mc32_8_sse2: 275.6 ( 6.45x) avg_h264_qpel_16_mc32_8_ssse3: 235.7 ( 7.54x) put_h264_qpel_16_mc12_8_c: 1808.2 ( 1.00x) put_h264_qpel_16_mc12_8_sse2: 267.2 ( 6.77x) put_h264_qpel_16_mc12_8_ssse3: 231.9 ( 7.80x) put_h264_qpel_16_mc32_8_c: 1766.9 ( 1.00x) put_h264_qpel_16_mc32_8_sse2: 272.9 ( 6.47x) put_h264_qpel_16_mc32_8_ssse3: 229.5 ( 7.70x) New benchmarks: avg_h264_qpel_16_mc12_8_c: 1742.3 ( 1.00x) avg_h264_qpel_16_mc12_8_sse2: 240.3 ( 7.25x) avg_h264_qpel_16_mc12_8_ssse3: 214.8 ( 8.11x) avg_h264_qpel_16_mc32_8_c: 1748.0 ( 1.00x) avg_h264_qpel_16_mc32_8_sse2: 238.0 ( 7.35x) avg_h264_qpel_16_mc32_8_ssse3: 209.2 ( 8.35x) put_h264_qpel_16_mc12_8_c: 2014.4 ( 1.00x) put_h264_qpel_16_mc12_8_sse2: 243.7 ( 8.27x) put_h264_qpel_16_mc12_8_ssse3: 211.5 ( 9.52x) put_h264_qpel_16_mc32_8_c: 1800.0 ( 1.00x) put_h264_qpel_16_mc32_8_sse2: 238.8 ( 7.54x) put_h264_qpel_16_mc32_8_ssse3: 206.7 ( 8.71x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 12 +++++------- libavcodec/x86/h264_qpel_8bit.asm | 27 +++++++++++++++++---------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 649bfabda8..b782d32bea 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -68,7 +68,8 @@ void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, in void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\ void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\ void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int 
h);\ -void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h); +void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\ +void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\ DEF_QPEL(avg) DEF_QPEL(put) @@ -104,12 +105,6 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(u ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ }\ -\ -static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ - ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ -}\ #if ARCH_X86_64 @@ -191,6 +186,9 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uin #define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext #define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext +#define ff_put_pixels16_l2_shift5_mmxext ff_put_pixels16_l2_shift5_sse2 +#define ff_avg_pixels16_l2_shift5_mmxext ff_avg_pixels16_l2_shift5_sse2 + #define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \ H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index 07056f1215..fefa3aff01 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -734,15 +734,19 @@ PIXELS4_L2_SHIFT5 put PIXELS4_L2_SHIFT5 avg -%macro PIXELS8_L2_SHIFT5 1 
-cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h +%macro PIXELS_L2_SHIFT5 2 +%if cpuflag(sse2) +cglobal %1_pixels%2_l2_shift5, 6, 6, 4 ; dst, src16, src8, dstStride, src8Stride, h +%else +cglobal %1_pixels%2_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h +%endif movsxdifnidn r3, r3d movsxdifnidn r4, r4d .loop: - mova m0, [r1] - mova m1, [r1+8] - mova m2, [r1+48] - mova m3, [r1+48+8] + movu m0, [r1] + movu m1, [r1+%2] + movu m2, [r1+48] + movu m3, [r1+48+%2] psraw m0, 5 psraw m1, 5 psraw m2, 5 @@ -751,8 +755,8 @@ cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h packuswb m2, m3 pavgb m0, [r2] pavgb m2, [r2+r4] - op_%1 m0, [r0], m4 - op_%1 m2, [r0+r3], m5 + op_%1 m0, [r0], m1 + op_%1 m2, [r0+r3], m1 lea r2, [r2+2*r4] add r1, 48*2 lea r0, [r0+2*r3] @@ -762,9 +766,12 @@ cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h %endmacro INIT_MMX mmxext -PIXELS8_L2_SHIFT5 put -PIXELS8_L2_SHIFT5 avg +PIXELS_L2_SHIFT5 put, 8 +PIXELS_L2_SHIFT5 avg, 8 +INIT_XMM sse2 +PIXELS_L2_SHIFT5 put, 16 +PIXELS_L2_SHIFT5 avg, 16 %if ARCH_X86_64 %macro QPEL16_H_LOWPASS_L2_OP 1 -- 2.49.1 >From e56d1788f788c894f773196ebec4a29f613230d3 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Mon, 29 Sep 2025 02:50:58 +0200 Subject: [PATCH 17/26] avcodec/x86/h264_qpel: Remove constant parameters from shift5 funcs They are constant since the size 16 version is no longer emulated via the size 8 version. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 10 +++++----- libavcodec/x86/h264_qpel_8bit.asm | 22 ++++++++++------------ 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index b782d32bea..e6035b8de5 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -67,9 +67,9 @@ void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\ void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\ void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\ -void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\ -void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\ -void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\ +void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ +void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ +void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ DEF_QPEL(avg) DEF_QPEL(put) @@ -309,7 +309,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uin int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - ff_ ## 
OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ + ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ @@ -319,7 +319,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uin int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ + ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride);\ }\ #define H264_MC(QPEL, SIZE, MMX, ALIGN)\ diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index fefa3aff01..7dbdf41d0a 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -701,9 +701,8 @@ QPEL8OR16_HV2_LOWPASS_OP_XMM avg %macro PIXELS4_L2_SHIFT5 1 -cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h +cglobal %1_pixels4_l2_shift5,4,4 ; dst, src16, src8, dstStride movsxdifnidn r3, r3d - movsxdifnidn r4, r4d mova m0, [r1] mova m1, [r1+24] psraw m0, 5 @@ -711,10 +710,9 @@ cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h packuswb m0, m0 packuswb m1, m1 pavgb m0, [r2] - pavgb m1, [r2+r4] + pavgb m1, [r2+4] op_%1h m0, [r0], m4 op_%1h m1, [r0+r3], m5 - lea r2, [r2+r4*2] lea r0, [r0+r3*2] mova m0, [r1+48] mova m1, [r1+72] @@ -722,8 +720,8 @@ cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h psraw m1, 5 packuswb m0, m0 packuswb m1, m1 - pavgb m0, [r2] - pavgb m1, [r2+r4] + pavgb m0, [r2+2*4] + pavgb m1, [r2+3*4] op_%1h m0, [r0], m4 op_%1h m1, [r0+r3], m5 RET @@ -736,12 +734,12 @@ PIXELS4_L2_SHIFT5 avg %macro PIXELS_L2_SHIFT5 2 %if cpuflag(sse2) -cglobal 
%1_pixels%2_l2_shift5, 6, 6, 4 ; dst, src16, src8, dstStride, src8Stride, h +cglobal %1_pixels%2_l2_shift5, 5, 5, 4 ; dst, src16, src8, dstStride %else -cglobal %1_pixels%2_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h +cglobal %1_pixels%2_l2_shift5, 5, 5 ; dst, src16, src8, dstStride %endif movsxdifnidn r3, r3d - movsxdifnidn r4, r4d + mov r4d, %2 .loop: movu m0, [r1] movu m1, [r1+%2] @@ -754,13 +752,13 @@ cglobal %1_pixels%2_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h packuswb m0, m1 packuswb m2, m3 pavgb m0, [r2] - pavgb m2, [r2+r4] + pavgb m2, [r2+%2] op_%1 m0, [r0], m1 op_%1 m2, [r0+r3], m1 - lea r2, [r2+2*r4] + lea r2, [r2+2*%2] add r1, 48*2 lea r0, [r0+2*r3] - sub r5d, 2 + sub r4d, 2 jne .loop RET %endmacro -- 2.49.1 >From 5ff695208ab05a71283f729889f73f21388ea300 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Mon, 29 Sep 2025 03:47:25 +0200 Subject: [PATCH 18/26] avcodec/x86/h264_qpel: Remove unused parameter from hv2_lowpass funcs tmpstride is unused. This also allows removing said parameter from lots of functions in h264_qpel.c.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 35 +++++++++++++++---------------- libavcodec/x86/h264_qpel_8bit.asm | 14 ++++++------- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index e6035b8de5..53b3ffb653 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -65,8 +65,8 @@ void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, int srcStride);\ void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\ void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\ -void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\ -void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\ +void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int h);\ +void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int size);\ void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ @@ -75,7 +75,7 @@ DEF_QPEL(avg) DEF_QPEL(put) #define QPEL_H264(OPNAME, OP, MMX)\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ 
## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ int w=3;\ src -= 2*srcStride+2;\ while(w--){\ @@ -87,10 +87,10 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\ }\ \ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int size){\ int w = size>>4;\ do{\ - ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\ + ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, size);\ tmp += 8;\ dst += 8;\ }while(w--);\ @@ -148,7 +148,6 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, const uint8_t *src, - int tmpStride, int srcStride, int size) { @@ -162,15 +161,15 @@ static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, } #define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ - ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride, int size){\ + put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, srcStride, size);\ + ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, size);\ }\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, 
int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 8);\ }\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 16);\ }\ #define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext @@ -279,7 +278,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uin static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\ - ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, stride);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ @@ -288,7 +287,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uin uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + ff_put_h264_qpel ## SIZE ## 
_hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ }\ \ @@ -298,7 +297,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uin uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ }\ \ @@ -308,7 +307,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uin uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride);\ }\ \ @@ -318,7 +317,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uin uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride);\ }\ diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index 7dbdf41d0a..9c4957f8e7 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -565,8 +565,7 @@ QPEL8OR16_HV1_LOWPASS_OP put %macro QPEL8OR16_HV2_LOWPASS_OP 1 -; unused is to match ssse3 and mmxext args -cglobal 
%1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h +cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4 ; dst, tmp, dstStride, h movsxdifnidn r2, r2d .loop: mova m0, [r1] @@ -599,7 +598,7 @@ cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h op_%1 m0, [r0], m7 add r1, 48 add r0, r2 - dec r4d + dec r3d jne .loop RET %endmacro @@ -609,10 +608,9 @@ QPEL8OR16_HV2_LOWPASS_OP put QPEL8OR16_HV2_LOWPASS_OP avg %macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1 -cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size +cglobal %1_h264_qpel8or16_hv2_lowpass, 4,4,8 ; dst, tmp, dstStride, size movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - cmp r4d, 16 + cmp r3d, 16 je .op16 .loop8: mova m1, [r1+16] @@ -640,7 +638,7 @@ cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, s op_%1h m0, [r0], m7 add r1, 48 add r0, r2 - dec r4d + dec r3d jne .loop8 jmp .done .op16: @@ -689,7 +687,7 @@ cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, s op_%1 m3, [r0], m7 add r1, 48 add r0, r2 - dec r4d + dec r3d jne .op16 .done: RET -- 2.49.1 >From 2ca6f31c0507d368377c7bc2ef5ad78fcf61835a Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Mon, 29 Sep 2025 04:30:19 +0200 Subject: [PATCH 19/26] avcodec/x86/h264_qpel_8bit: Optimize branch away ff_{avg,put}_h264_qpel8or16_hv2_lowpass_ssse3() currently is almost the disjoint union of the codepaths for sizes 8 and 16. This size is a compile-time constant at every callsite. So split the function and avoid the runtime branch. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 15 ++++++++++++++- libavcodec/x86/h264_qpel_8bit.asm | 26 ++++++++++++++++---------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 53b3ffb653..010cb51991 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -66,7 +66,8 @@ void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_ void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\ void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\ void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int h);\ -void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int size);\ +void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\ +void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\ void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ @@ -172,6 +173,18 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uin ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 16);\ }\ +#define SSSE3_HV2_LOWPASS_WRAPPER(OPNAME) \ +static av_always_inline void \ +ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int size) \ +{\ + if (size == 8)\ + ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(dst, tmp, dstStride);\ + else\ + 
ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(dst, tmp, dstStride);\ +} +SSSE3_HV2_LOWPASS_WRAPPER(avg) +SSSE3_HV2_LOWPASS_WRAPPER(put) + #define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext #define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext #define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index 9c4957f8e7..7b6b51be04 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -608,11 +608,14 @@ QPEL8OR16_HV2_LOWPASS_OP put QPEL8OR16_HV2_LOWPASS_OP avg %macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1 -cglobal %1_h264_qpel8or16_hv2_lowpass, 4,4,8 ; dst, tmp, dstStride, size +%ifidn %1, avg +cglobal %1_h264_qpel8_hv2_lowpass, 3,4,7 ; dst, tmp, dstStride +%else +cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride +%endif movsxdifnidn r2, r2d - cmp r3d, 16 - je .op16 -.loop8: + mov r3d, 8 +.loop: mova m1, [r1+16] mova m0, [r1] mova m2, m1 @@ -635,13 +638,17 @@ cglobal %1_h264_qpel8or16_hv2_lowpass, 4,4,8 ; dst, tmp, dstStride, size paddw m0, m2 psraw m0, 6 packuswb m0, m0 - op_%1h m0, [r0], m7 + op_%1h m0, [r0], m6 add r1, 48 add r0, r2 dec r3d - jne .loop8 - jmp .done -.op16: + jne .loop + RET + +cglobal %1_h264_qpel16_hv2_lowpass, 3,4,8 ; dst, tmp, dstStride + movsxdifnidn r2, r2d + mov r3d, 16 +.loop: mova m4, [r1+32] mova m5, [r1+16] mova m7, [r1] @@ -688,8 +695,7 @@ cglobal %1_h264_qpel8or16_hv2_lowpass, 4,4,8 ; dst, tmp, dstStride, size add r1, 48 add r0, r2 dec r3d - jne .op16 -.done: + jne .loop RET %endmacro -- 2.49.1 >From f869cc66f66339b93e3c9ca279b7d00cf3d9a5f7 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Mon, 29 Sep 2025 07:56:53 +0200 Subject: [PATCH 20/26] avcodec/x86/h264_qpel_8bit: Replace qpel8_h_lowpass_l2 MMXEXT by SSE2 Using xmm registers here is very natural, as it allows operating on eight words
at a time. It also saves 48B here and does not clobber the MMX state. Old benchmarks (only tests affected by the modified function are shown): avg_h264_qpel_8_mc11_8_c: 352.2 ( 1.00x) avg_h264_qpel_8_mc11_8_sse2: 70.4 ( 5.00x) avg_h264_qpel_8_mc11_8_ssse3: 53.9 ( 6.53x) avg_h264_qpel_8_mc13_8_c: 353.3 ( 1.00x) avg_h264_qpel_8_mc13_8_sse2: 72.8 ( 4.86x) avg_h264_qpel_8_mc13_8_ssse3: 53.8 ( 6.57x) avg_h264_qpel_8_mc21_8_c: 404.0 ( 1.00x) avg_h264_qpel_8_mc21_8_sse2: 116.1 ( 3.48x) avg_h264_qpel_8_mc21_8_ssse3: 94.3 ( 4.28x) avg_h264_qpel_8_mc23_8_c: 398.9 ( 1.00x) avg_h264_qpel_8_mc23_8_sse2: 118.6 ( 3.36x) avg_h264_qpel_8_mc23_8_ssse3: 94.8 ( 4.21x) avg_h264_qpel_8_mc31_8_c: 352.7 ( 1.00x) avg_h264_qpel_8_mc31_8_sse2: 71.4 ( 4.94x) avg_h264_qpel_8_mc31_8_ssse3: 53.8 ( 6.56x) avg_h264_qpel_8_mc33_8_c: 354.0 ( 1.00x) avg_h264_qpel_8_mc33_8_sse2: 70.6 ( 5.01x) avg_h264_qpel_8_mc33_8_ssse3: 53.7 ( 6.59x) avg_h264_qpel_16_mc11_8_c: 1417.0 ( 1.00x) avg_h264_qpel_16_mc11_8_sse2: 276.9 ( 5.12x) avg_h264_qpel_16_mc11_8_ssse3: 178.8 ( 7.92x) avg_h264_qpel_16_mc13_8_c: 1427.3 ( 1.00x) avg_h264_qpel_16_mc13_8_sse2: 277.4 ( 5.14x) avg_h264_qpel_16_mc13_8_ssse3: 179.7 ( 7.94x) avg_h264_qpel_16_mc21_8_c: 1634.1 ( 1.00x) avg_h264_qpel_16_mc21_8_sse2: 421.3 ( 3.88x) avg_h264_qpel_16_mc21_8_ssse3: 291.2 ( 5.61x) avg_h264_qpel_16_mc23_8_c: 1627.0 ( 1.00x) avg_h264_qpel_16_mc23_8_sse2: 420.8 ( 3.87x) avg_h264_qpel_16_mc23_8_ssse3: 291.0 ( 5.59x) avg_h264_qpel_16_mc31_8_c: 1418.4 ( 1.00x) avg_h264_qpel_16_mc31_8_sse2: 278.5 ( 5.09x) avg_h264_qpel_16_mc31_8_ssse3: 178.6 ( 7.94x) avg_h264_qpel_16_mc33_8_c: 1407.3 ( 1.00x) avg_h264_qpel_16_mc33_8_sse2: 277.6 ( 5.07x) avg_h264_qpel_16_mc33_8_ssse3: 179.9 ( 7.82x) put_h264_qpel_8_mc11_8_c: 348.1 ( 1.00x) put_h264_qpel_8_mc11_8_sse2: 69.1 ( 5.04x) put_h264_qpel_8_mc11_8_ssse3: 53.8 ( 6.47x) put_h264_qpel_8_mc13_8_c: 349.3 ( 1.00x) put_h264_qpel_8_mc13_8_sse2: 69.7 ( 5.01x) put_h264_qpel_8_mc13_8_ssse3: 53.7 ( 6.51x) put_h264_qpel_8_mc21_8_c: 
398.5 ( 1.00x) put_h264_qpel_8_mc21_8_sse2: 115.0 ( 3.46x) put_h264_qpel_8_mc21_8_ssse3: 95.3 ( 4.18x) put_h264_qpel_8_mc23_8_c: 399.9 ( 1.00x) put_h264_qpel_8_mc23_8_sse2: 120.8 ( 3.31x) put_h264_qpel_8_mc23_8_ssse3: 95.4 ( 4.19x) put_h264_qpel_8_mc31_8_c: 350.4 ( 1.00x) put_h264_qpel_8_mc31_8_sse2: 69.6 ( 5.03x) put_h264_qpel_8_mc31_8_ssse3: 54.2 ( 6.47x) put_h264_qpel_8_mc33_8_c: 353.1 ( 1.00x) put_h264_qpel_8_mc33_8_sse2: 71.0 ( 4.97x) put_h264_qpel_8_mc33_8_ssse3: 54.2 ( 6.51x) put_h264_qpel_16_mc11_8_c: 1384.2 ( 1.00x) put_h264_qpel_16_mc11_8_sse2: 272.9 ( 5.07x) put_h264_qpel_16_mc11_8_ssse3: 178.3 ( 7.76x) put_h264_qpel_16_mc13_8_c: 1393.6 ( 1.00x) put_h264_qpel_16_mc13_8_sse2: 271.1 ( 5.14x) put_h264_qpel_16_mc13_8_ssse3: 178.3 ( 7.82x) put_h264_qpel_16_mc21_8_c: 1612.6 ( 1.00x) put_h264_qpel_16_mc21_8_sse2: 416.5 ( 3.87x) put_h264_qpel_16_mc21_8_ssse3: 289.1 ( 5.58x) put_h264_qpel_16_mc23_8_c: 1621.3 ( 1.00x) put_h264_qpel_16_mc23_8_sse2: 416.9 ( 3.89x) put_h264_qpel_16_mc23_8_ssse3: 289.4 ( 5.60x) put_h264_qpel_16_mc31_8_c: 1408.4 ( 1.00x) put_h264_qpel_16_mc31_8_sse2: 273.5 ( 5.15x) put_h264_qpel_16_mc31_8_ssse3: 176.9 ( 7.96x) put_h264_qpel_16_mc33_8_c: 1396.4 ( 1.00x) put_h264_qpel_16_mc33_8_sse2: 276.3 ( 5.05x) put_h264_qpel_16_mc33_8_ssse3: 176.4 ( 7.92x) New benchmarks: avg_h264_qpel_8_mc11_8_c: 352.1 ( 1.00x) avg_h264_qpel_8_mc11_8_sse2: 52.5 ( 6.71x) avg_h264_qpel_8_mc11_8_ssse3: 53.9 ( 6.54x) avg_h264_qpel_8_mc13_8_c: 350.8 ( 1.00x) avg_h264_qpel_8_mc13_8_sse2: 54.7 ( 6.42x) avg_h264_qpel_8_mc13_8_ssse3: 54.3 ( 6.46x) avg_h264_qpel_8_mc21_8_c: 400.1 ( 1.00x) avg_h264_qpel_8_mc21_8_sse2: 98.6 ( 4.06x) avg_h264_qpel_8_mc21_8_ssse3: 95.5 ( 4.19x) avg_h264_qpel_8_mc23_8_c: 400.4 ( 1.00x) avg_h264_qpel_8_mc23_8_sse2: 101.4 ( 3.95x) avg_h264_qpel_8_mc23_8_ssse3: 95.9 ( 4.18x) avg_h264_qpel_8_mc31_8_c: 352.4 ( 1.00x) avg_h264_qpel_8_mc31_8_sse2: 52.9 ( 6.67x) avg_h264_qpel_8_mc31_8_ssse3: 54.4 ( 6.48x) avg_h264_qpel_8_mc33_8_c: 354.5 ( 1.00x) 
avg_h264_qpel_8_mc33_8_sse2: 52.9 ( 6.70x) avg_h264_qpel_8_mc33_8_ssse3: 54.4 ( 6.52x) avg_h264_qpel_16_mc11_8_c: 1420.4 ( 1.00x) avg_h264_qpel_16_mc11_8_sse2: 204.8 ( 6.93x) avg_h264_qpel_16_mc11_8_ssse3: 177.9 ( 7.98x) avg_h264_qpel_16_mc13_8_c: 1409.8 ( 1.00x) avg_h264_qpel_16_mc13_8_sse2: 206.4 ( 6.83x) avg_h264_qpel_16_mc13_8_ssse3: 178.0 ( 7.92x) avg_h264_qpel_16_mc21_8_c: 1634.1 ( 1.00x) avg_h264_qpel_16_mc21_8_sse2: 349.6 ( 4.67x) avg_h264_qpel_16_mc21_8_ssse3: 290.0 ( 5.63x) avg_h264_qpel_16_mc23_8_c: 1624.1 ( 1.00x) avg_h264_qpel_16_mc23_8_sse2: 350.0 ( 4.64x) avg_h264_qpel_16_mc23_8_ssse3: 291.9 ( 5.56x) avg_h264_qpel_16_mc31_8_c: 1407.2 ( 1.00x) avg_h264_qpel_16_mc31_8_sse2: 205.8 ( 6.84x) avg_h264_qpel_16_mc31_8_ssse3: 178.2 ( 7.90x) avg_h264_qpel_16_mc33_8_c: 1400.5 ( 1.00x) avg_h264_qpel_16_mc33_8_sse2: 206.3 ( 6.79x) avg_h264_qpel_16_mc33_8_ssse3: 179.4 ( 7.81x) put_h264_qpel_8_mc11_8_c: 349.7 ( 1.00x) put_h264_qpel_8_mc11_8_sse2: 50.2 ( 6.96x) put_h264_qpel_8_mc11_8_ssse3: 51.3 ( 6.82x) put_h264_qpel_8_mc13_8_c: 349.8 ( 1.00x) put_h264_qpel_8_mc13_8_sse2: 50.7 ( 6.90x) put_h264_qpel_8_mc13_8_ssse3: 51.7 ( 6.76x) put_h264_qpel_8_mc21_8_c: 398.0 ( 1.00x) put_h264_qpel_8_mc21_8_sse2: 96.5 ( 4.13x) put_h264_qpel_8_mc21_8_ssse3: 92.3 ( 4.31x) put_h264_qpel_8_mc23_8_c: 401.4 ( 1.00x) put_h264_qpel_8_mc23_8_sse2: 102.3 ( 3.92x) put_h264_qpel_8_mc23_8_ssse3: 92.8 ( 4.32x) put_h264_qpel_8_mc31_8_c: 349.4 ( 1.00x) put_h264_qpel_8_mc31_8_sse2: 50.8 ( 6.88x) put_h264_qpel_8_mc31_8_ssse3: 51.8 ( 6.75x) put_h264_qpel_8_mc33_8_c: 351.1 ( 1.00x) put_h264_qpel_8_mc33_8_sse2: 52.2 ( 6.73x) put_h264_qpel_8_mc33_8_ssse3: 51.7 ( 6.79x) put_h264_qpel_16_mc11_8_c: 1391.1 ( 1.00x) put_h264_qpel_16_mc11_8_sse2: 196.6 ( 7.07x) put_h264_qpel_16_mc11_8_ssse3: 178.2 ( 7.81x) put_h264_qpel_16_mc13_8_c: 1385.2 ( 1.00x) put_h264_qpel_16_mc13_8_sse2: 195.6 ( 7.08x) put_h264_qpel_16_mc13_8_ssse3: 176.6 ( 7.84x) put_h264_qpel_16_mc21_8_c: 1607.5 ( 1.00x) 
put_h264_qpel_16_mc21_8_sse2: 341.0 ( 4.71x) put_h264_qpel_16_mc21_8_ssse3: 289.1 ( 5.56x) put_h264_qpel_16_mc23_8_c: 1616.7 ( 1.00x) put_h264_qpel_16_mc23_8_sse2: 340.8 ( 4.74x) put_h264_qpel_16_mc23_8_ssse3: 288.6 ( 5.60x) put_h264_qpel_16_mc31_8_c: 1397.6 ( 1.00x) put_h264_qpel_16_mc31_8_sse2: 197.3 ( 7.08x) put_h264_qpel_16_mc31_8_ssse3: 175.4 ( 7.97x) put_h264_qpel_16_mc33_8_c: 1394.3 ( 1.00x) put_h264_qpel_16_mc33_8_sse2: 197.7 ( 7.05x) put_h264_qpel_16_mc33_8_ssse3: 175.2 ( 7.96x) As can be seen, the SSE2 version is often neck and neck with the SSSE3 version (which also benefits from a better hv2_lowpass SSSE3 implementation for mc21 and mc23) for eight-byte block sizes. Unsurprisingly, SSSE3 beats SSE2 for 16x16 blocks: For SSE2, these blocks are processed by calling the 8x8 function four times whereas SSSE3 has a dedicated function (on x64). This implementation should also be extendable to an AVX version for 16x16 blocks. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 32 +++++--------- libavcodec/x86/h264_qpel_8bit.asm | 71 ++++++++++++------------------- 2 files changed, 38 insertions(+), 65 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 010cb51991..c49a866c5d 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -58,7 +58,7 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ -void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ +void ff_ ## OPNAME ##
_h264_qpel8_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\ @@ -96,15 +96,16 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX dst += 8;\ }while(w--);\ }\ -\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + +#define QPEL_H264_H16(OPNAME, EXT) \ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## EXT(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## EXT(dst , src , src2 , dstStride, src2Stride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## EXT(dst+8, src+8, src2+8, dstStride, src2Stride);\ src += 8*dstStride;\ dst += 8*dstStride;\ src2 += 8*src2Stride;\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## EXT(dst , src , src2 , dstStride, src2Stride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## EXT(dst+8, src+8, src2+8, dstStride, src2Stride);\ }\ @@ -115,16 +116,7 @@ void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, con void 
ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride); #else // ARCH_X86_64 -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ - src += 8*dstStride;\ - dst += 8*dstStride;\ - src2 += 8*src2Stride;\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ -} +#define QPEL_H264_H16_XMM(OPNAME, OP, EXT) QPEL_H264_H16(OPNAME, EXT) #endif // ARCH_X86_64 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ @@ -185,10 +177,8 @@ ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, i SSSE3_HV2_LOWPASS_WRAPPER(avg) SSSE3_HV2_LOWPASS_WRAPPER(put) -#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext -#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext -#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext -#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext +QPEL_H264_H16(avg_, sse2) +QPEL_H264_H16(put_, sse2) #define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2 #define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2 diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index 7b6b51be04..68c0094426 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -230,56 +230,39 @@ QPEL4_H_LOWPASS_L2_OP avg %macro QPEL8_H_LOWPASS_L2_OP 1 -cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, 
dstStride, srcStride +cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,6 ; dst, src, src2, dstStride, srcStride movsxdifnidn r3, r3d movsxdifnidn r4, r4d + mova m3, [pw_16] mov r5d, 8 - pxor m7, m7 - mova m6, [pw_5] + pxor m5, m5 + mova m4, [pw_5] .loop: - mova m0, [r1] - mova m2, [r1+1] - mova m1, m0 - mova m3, m2 - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - paddw m0, m2 - paddw m1, m3 + movh m0, [r1] + movh m1, [r1+1] + punpcklbw m0, m5 + punpcklbw m1, m5 + paddw m0, m1 psllw m0, 2 - psllw m1, 2 - mova m2, [r1-1] - mova m4, [r1+2] - mova m3, m2 - mova m5, m4 - punpcklbw m2, m7 - punpckhbw m3, m7 - punpcklbw m4, m7 - punpckhbw m5, m7 - paddw m2, m4 - paddw m5, m3 - psubw m0, m2 - psubw m1, m5 - pmullw m0, m6 - pmullw m1, m6 - movd m2, [r1-2] - movd m5, [r1+7] - punpcklbw m2, m7 - punpcklbw m5, m7 - paddw m2, m3 - paddw m4, m5 - mova m5, [pw_16] - paddw m2, m5 - paddw m4, m5 + movh m1, [r1-1] + movh m2, [r1+2] + punpcklbw m1, m5 + punpcklbw m2, m5 + paddw m1, m2 + psubw m0, m1 + pmullw m0, m4 + movh m1, [r1-2] + movh m2, [r1+3] + punpcklbw m1, m5 + punpcklbw m2, m5 + paddw m0, m1 paddw m0, m2 - paddw m1, m4 + paddw m0, m3 psraw m0, 5 - psraw m1, 5 - mova m4, [r2] - packuswb m0, m1 - pavgb m0, m4 - op_%1 m0, [r0], m4 + packuswb m0, m5 + movh m2, [r2] + pavgb m0, m2 + op_%1h m0, [r0], m2 add r0, r3 add r1, r3 add r2, r4 @@ -288,7 +271,7 @@ cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride RET %endmacro -INIT_MMX mmxext +INIT_XMM sse2 QPEL8_H_LOWPASS_L2_OP put QPEL8_H_LOWPASS_L2_OP avg -- 2.49.1 >From 29b3d7875e97a5c7013571cee7add42293d2472a Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Mon, 29 Sep 2025 09:54:38 +0200 Subject: [PATCH 21/26] avcodec/x86/h264_qpel_8bit: Remove dead macro Forgotten in 4011a76494a5ff6844312813bc753aae8e54c2f0. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel_8bit.asm | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index 68c0094426..dc55a8ad93 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -379,17 +379,11 @@ QPEL4_V_LOWPASS_OP avg %macro QPEL8OR16_V_LOWPASS_OP 1 -%if cpuflag(sse2) cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h movsxdifnidn r2, r2d movsxdifnidn r3, r3d sub r1, r3 sub r1, r3 -%else -cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d -%endif pxor m7, m7 movh m0, [r1] movh m1, [r1+r3] @@ -503,8 +497,8 @@ INIT_MMX mmxext QPEL4_HV1_LOWPASS_OP put QPEL4_HV1_LOWPASS_OP avg -%macro QPEL8OR16_HV1_LOWPASS_OP 1 -cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size +INIT_XMM sse2 +cglobal put_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size movsxdifnidn r2, r2d pxor m7, m7 movh m0, [r0] @@ -540,11 +534,6 @@ cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size FILT_HV 15*48 .end: RET -%endmacro - -INIT_XMM sse2 -QPEL8OR16_HV1_LOWPASS_OP put - %macro QPEL8OR16_HV2_LOWPASS_OP 1 -- 2.49.1 >From e613d0d0da941e29b6d7df6ebf837a51e9e6d566 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Mon, 29 Sep 2025 19:51:47 +0200 Subject: [PATCH 22/26] avcodec/x86/h264_qpel: Add ff_{avg,put}_h264_qpel16_h_lowpass_l2_sse2() These functions are currently emulated via four calls to the versions for 8x8 blocks. In fact, the size savings from the simplified calls in h264_qpel.c (GCC 1344B, Clang 1280B) more than outweigh the size of the added functions (512B) here. It is also beneficial performance-wise. 
Old benchmarks: avg_h264_qpel_16_mc11_8_c: 1414.1 ( 1.00x) avg_h264_qpel_16_mc11_8_sse2: 206.2 ( 6.86x) avg_h264_qpel_16_mc11_8_ssse3: 177.7 ( 7.96x) avg_h264_qpel_16_mc13_8_c: 1417.0 ( 1.00x) avg_h264_qpel_16_mc13_8_sse2: 207.4 ( 6.83x) avg_h264_qpel_16_mc13_8_ssse3: 178.2 ( 7.95x) avg_h264_qpel_16_mc21_8_c: 1632.8 ( 1.00x) avg_h264_qpel_16_mc21_8_sse2: 349.3 ( 4.67x) avg_h264_qpel_16_mc21_8_ssse3: 291.3 ( 5.60x) avg_h264_qpel_16_mc23_8_c: 1640.2 ( 1.00x) avg_h264_qpel_16_mc23_8_sse2: 351.3 ( 4.67x) avg_h264_qpel_16_mc23_8_ssse3: 290.8 ( 5.64x) avg_h264_qpel_16_mc31_8_c: 1411.7 ( 1.00x) avg_h264_qpel_16_mc31_8_sse2: 203.4 ( 6.94x) avg_h264_qpel_16_mc31_8_ssse3: 178.9 ( 7.89x) avg_h264_qpel_16_mc33_8_c: 1409.7 ( 1.00x) avg_h264_qpel_16_mc33_8_sse2: 204.6 ( 6.89x) avg_h264_qpel_16_mc33_8_ssse3: 178.1 ( 7.92x) put_h264_qpel_16_mc11_8_c: 1391.0 ( 1.00x) put_h264_qpel_16_mc11_8_sse2: 197.4 ( 7.05x) put_h264_qpel_16_mc11_8_ssse3: 176.1 ( 7.90x) put_h264_qpel_16_mc13_8_c: 1395.9 ( 1.00x) put_h264_qpel_16_mc13_8_sse2: 196.7 ( 7.10x) put_h264_qpel_16_mc13_8_ssse3: 177.7 ( 7.85x) put_h264_qpel_16_mc21_8_c: 1609.5 ( 1.00x) put_h264_qpel_16_mc21_8_sse2: 341.1 ( 4.72x) put_h264_qpel_16_mc21_8_ssse3: 289.2 ( 5.57x) put_h264_qpel_16_mc23_8_c: 1604.0 ( 1.00x) put_h264_qpel_16_mc23_8_sse2: 340.9 ( 4.71x) put_h264_qpel_16_mc23_8_ssse3: 289.6 ( 5.54x) put_h264_qpel_16_mc31_8_c: 1390.2 ( 1.00x) put_h264_qpel_16_mc31_8_sse2: 194.6 ( 7.14x) put_h264_qpel_16_mc31_8_ssse3: 176.4 ( 7.88x) put_h264_qpel_16_mc33_8_c: 1400.4 ( 1.00x) put_h264_qpel_16_mc33_8_sse2: 198.5 ( 7.06x) put_h264_qpel_16_mc33_8_ssse3: 176.2 ( 7.95x) New benchmarks: avg_h264_qpel_16_mc11_8_c: 1413.3 ( 1.00x) avg_h264_qpel_16_mc11_8_sse2: 171.8 ( 8.23x) avg_h264_qpel_16_mc11_8_ssse3: 173.0 ( 8.17x) avg_h264_qpel_16_mc13_8_c: 1423.2 ( 1.00x) avg_h264_qpel_16_mc13_8_sse2: 172.0 ( 8.27x) avg_h264_qpel_16_mc13_8_ssse3: 173.4 ( 8.21x) avg_h264_qpel_16_mc21_8_c: 1641.3 ( 1.00x) avg_h264_qpel_16_mc21_8_sse2: 322.1 ( 5.10x) 
avg_h264_qpel_16_mc21_8_ssse3: 291.3 ( 5.63x) avg_h264_qpel_16_mc23_8_c: 1629.1 ( 1.00x) avg_h264_qpel_16_mc23_8_sse2: 323.0 ( 5.04x) avg_h264_qpel_16_mc23_8_ssse3: 293.3 ( 5.55x) avg_h264_qpel_16_mc31_8_c: 1409.2 ( 1.00x) avg_h264_qpel_16_mc31_8_sse2: 172.0 ( 8.19x) avg_h264_qpel_16_mc31_8_ssse3: 173.7 ( 8.11x) avg_h264_qpel_16_mc33_8_c: 1402.5 ( 1.00x) avg_h264_qpel_16_mc33_8_sse2: 172.5 ( 8.13x) avg_h264_qpel_16_mc33_8_ssse3: 173.6 ( 8.08x) put_h264_qpel_16_mc11_8_c: 1393.7 ( 1.00x) put_h264_qpel_16_mc11_8_sse2: 170.4 ( 8.18x) put_h264_qpel_16_mc11_8_ssse3: 178.2 ( 7.82x) put_h264_qpel_16_mc13_8_c: 1398.0 ( 1.00x) put_h264_qpel_16_mc13_8_sse2: 170.2 ( 8.21x) put_h264_qpel_16_mc13_8_ssse3: 178.6 ( 7.83x) put_h264_qpel_16_mc21_8_c: 1619.6 ( 1.00x) put_h264_qpel_16_mc21_8_sse2: 320.6 ( 5.05x) put_h264_qpel_16_mc21_8_ssse3: 297.2 ( 5.45x) put_h264_qpel_16_mc23_8_c: 1617.4 ( 1.00x) put_h264_qpel_16_mc23_8_sse2: 320.0 ( 5.05x) put_h264_qpel_16_mc23_8_ssse3: 297.4 ( 5.44x) put_h264_qpel_16_mc31_8_c: 1389.7 ( 1.00x) put_h264_qpel_16_mc31_8_sse2: 169.9 ( 8.18x) put_h264_qpel_16_mc31_8_ssse3: 178.1 ( 7.80x) put_h264_qpel_16_mc33_8_c: 1394.0 ( 1.00x) put_h264_qpel_16_mc33_8_sse2: 170.9 ( 8.16x) put_h264_qpel_16_mc33_8_ssse3: 176.9 ( 7.88x) Notice that the SSSE3 versions of mc21 and mc23 benefit from an optimized version of hv2_lowpass. Also notice that there is no SSE2 version of the purely horizontal motion compensation. This means that src2 is currently always aligned when calling the SSE2 functions (and that src2Stride is always equal to the block width). This has not been exploited yet, though.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 4 +- libavcodec/x86/h264_qpel_8bit.asm | 74 +++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index c49a866c5d..75caac8805 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -59,6 +59,7 @@ void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t * void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ +void ff_ ## OPNAME ## _h264_qpel16_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\ @@ -177,9 +178,6 @@ ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, i SSSE3_HV2_LOWPASS_WRAPPER(avg) SSSE3_HV2_LOWPASS_WRAPPER(put) -QPEL_H264_H16(avg_, sse2) -QPEL_H264_H16(put_, sse2) - #define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2 #define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2 #define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2 diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index dc55a8ad93..101ab21647 100644 --- 
a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -276,6 +276,80 @@ QPEL8_H_LOWPASS_L2_OP put QPEL8_H_LOWPASS_L2_OP avg +%macro QPEL16_H_LOWPASS_L2 1 +%if ARCH_X86_64 +cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,9 ; dst, src, src2, dstStride, srcStride + mova m8, [pw_16] +%define PW_16 m8 +%else +cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, srcStride +%define PW_16 [pw_16] +%endif + movsxdifnidn r3, r3d + movsxdifnidn r4, r4d + mov r5d, 16 + pxor m7, m7 + mova m6, [pw_5] +.loop: + movu m0, [r1] + movu m2, [r1+1] + mova m1, m0 + mova m3, m2 + punpcklbw m0, m7 + punpcklbw m2, m7 + punpckhbw m1, m7 + punpckhbw m3, m7 + paddw m0, m2 + paddw m1, m3 + psllw m0, 2 + psllw m1, 2 + movu m2, [r1-1] + movu m4, [r1+2] + mova m3, m2 + mova m5, m4 + punpcklbw m2, m7 + punpcklbw m4, m7 + punpckhbw m3, m7 + punpckhbw m5, m7 + paddw m2, m4 + paddw m3, m5 + psubw m0, m2 + psubw m1, m3 + pmullw m0, m6 + pmullw m1, m6 + movu m2, [r1-2] + movu m4, [r1+3] + mova m3, m2 + mova m5, m4 + punpcklbw m2, m7 + punpcklbw m4, m7 + punpckhbw m3, m7 + punpckhbw m5, m7 + paddw m2, m4 + paddw m3, m5 + paddw m0, m2 + paddw m1, m3 + paddw m0, PW_16 + paddw m1, PW_16 + psraw m0, 5 + psraw m1, 5 + packuswb m0, m1 + movu m4, [r2] + pavgb m0, m4 + op_%1 m0, [r0], m4 + add r0, r3 + add r1, r3 + add r2, r4 + dec r5d + jg .loop + RET +%endmacro + +INIT_XMM sse2 +QPEL16_H_LOWPASS_L2 put +QPEL16_H_LOWPASS_L2 avg + + %macro QPEL8_H_LOWPASS_L2_OP_XMM 1 cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride movsxdifnidn r3, r3d -- 2.49.1 >From 7f5961652abb3e7dd475d64c19a1b75cbc9af464 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Mon, 29 Sep 2025 20:27:44 +0200 Subject: [PATCH 23/26] avcodec/x86/h264_qpel: Don't use ff_ prefix for static functions Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 22 +++++++++++----------- 1 file changed, 11 
insertions(+), 11 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 75caac8805..48dd9d8766 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -77,7 +77,7 @@ DEF_QPEL(avg) DEF_QPEL(put) #define QPEL_H264(OPNAME, OP, MMX)\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ +static av_always_inline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ int w=3;\ src -= 2*srcStride+2;\ while(w--){\ @@ -155,15 +155,15 @@ static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, } #define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride, int size){\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride, int size){\ put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, srcStride, size);\ ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, size);\ }\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 8);\ +static av_always_inline void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 8);\ }\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, 
srcStride, 16);\ +static av_always_inline void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 16);\ }\ #define SSSE3_HV2_LOWPASS_WRAPPER(OPNAME) \ @@ -279,7 +279,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uin static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\ - ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, stride);\ + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, stride);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ @@ -288,7 +288,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uin uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ }\ \ @@ -298,7 +298,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uin uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ }\ \ @@ -308,7 +308,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uin uint8_t * const halfHV= 
temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride);\ }\ \ @@ -318,7 +318,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uin uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride);\ }\ -- 2.49.1 >From 543c03e207b46d6218117b3121652168c9d33e98 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Mon, 29 Sep 2025 22:59:10 +0200 Subject: [PATCH 24/26] avcodec/x86/h264_qpel: Port pixels8_l2_shift5 from MMXEXT to SSE2 This abides by the ABI (no missing emms) and yields a tiny performance improvement here.
Old benchmarks: avg_h264_qpel_8_mc12_8_c: 419.9 ( 1.00x) avg_h264_qpel_8_mc12_8_sse2: 78.9 ( 5.32x) avg_h264_qpel_8_mc12_8_ssse3: 71.7 ( 5.86x) avg_h264_qpel_8_mc32_8_c: 429.1 ( 1.00x) avg_h264_qpel_8_mc32_8_sse2: 76.9 ( 5.58x) avg_h264_qpel_8_mc32_8_ssse3: 73.4 ( 5.84x) put_h264_qpel_8_mc12_8_c: 424.0 ( 1.00x) put_h264_qpel_8_mc12_8_sse2: 78.6 ( 5.40x) put_h264_qpel_8_mc12_8_ssse3: 70.6 ( 6.00x) put_h264_qpel_8_mc32_8_c: 425.7 ( 1.00x) put_h264_qpel_8_mc32_8_sse2: 75.2 ( 5.66x) put_h264_qpel_8_mc32_8_ssse3: 70.4 ( 6.05x) New benchmarks: avg_h264_qpel_8_mc12_8_c: 425.7 ( 1.00x) avg_h264_qpel_8_mc12_8_sse2: 77.5 ( 5.49x) avg_h264_qpel_8_mc12_8_ssse3: 69.8 ( 6.10x) avg_h264_qpel_8_mc32_8_c: 423.7 ( 1.00x) avg_h264_qpel_8_mc32_8_sse2: 74.6 ( 5.68x) avg_h264_qpel_8_mc32_8_ssse3: 71.9 ( 5.89x) put_h264_qpel_8_mc12_8_c: 422.2 ( 1.00x) put_h264_qpel_8_mc12_8_sse2: 75.8 ( 5.57x) put_h264_qpel_8_mc12_8_ssse3: 67.9 ( 6.22x) put_h264_qpel_8_mc32_8_c: 421.8 ( 1.00x) put_h264_qpel_8_mc32_8_sse2: 72.6 ( 5.81x) put_h264_qpel_8_mc32_8_ssse3: 67.7 ( 6.23x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 10 ++++---- libavcodec/x86/h264_qpel_8bit.asm | 38 ++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 48dd9d8766..52a6bfd5bf 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -70,7 +70,7 @@ void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_ void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\ void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\ void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ -void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ 
+void ff_ ## OPNAME ## _pixels8_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ DEF_QPEL(avg) @@ -186,8 +186,8 @@ SSSE3_HV2_LOWPASS_WRAPPER(put) #define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext #define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext -#define ff_put_pixels16_l2_shift5_mmxext ff_put_pixels16_l2_shift5_sse2 -#define ff_avg_pixels16_l2_shift5_mmxext ff_avg_pixels16_l2_shift5_sse2 +#define ff_put_pixels4_l2_shift5_sse2 ff_put_pixels4_l2_shift5_mmxext +#define ff_avg_pixels4_l2_shift5_sse2 ff_avg_pixels4_l2_shift5_mmxext #define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \ H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ @@ -309,7 +309,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uin int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ - ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride);\ + ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_sse2(dst, halfV+2, halfHV, stride);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ @@ -319,7 +319,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uin int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ - ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride);\ + ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_sse2(dst, halfV+3, halfHV, stride);\ }\ #define H264_MC(QPEL, SIZE, MMX, ALIGN)\ diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm 
index 101ab21647..9ca78b0775 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -781,13 +781,30 @@ INIT_MMX mmxext PIXELS4_L2_SHIFT5 put PIXELS4_L2_SHIFT5 avg +%macro PIXELS8_L2_SHIFT5 1 +cglobal %1_pixels8_l2_shift5, 5, 5, 3 ; dst, src16, src8, dstStride + movsxdifnidn r3, r3d + mov r4d, 8 +.loop: + movu m0, [r1] + movu m1, [r1+48] + psraw m0, 5 + psraw m1, 5 + packuswb m0, m1 + pavgb m0, [r2] + pshufd m1, m0, 0xee ; low half of m1 is high half of m0 + op_%1h m0, [r0], m2 + op_%1h m1, [r0+r3], m2 + add r1, 48*2 + add r2, 8*2 + lea r0, [r0+2*r3] + sub r4d, 2 + jne .loop + RET +%endmacro -%macro PIXELS_L2_SHIFT5 2 -%if cpuflag(sse2) +%macro PIXELS16_L2_SHIFT5 2 cglobal %1_pixels%2_l2_shift5, 5, 5, 4 ; dst, src16, src8, dstStride -%else -cglobal %1_pixels%2_l2_shift5, 5, 5 ; dst, src16, src8, dstStride -%endif movsxdifnidn r3, r3d mov r4d, %2 .loop: @@ -813,13 +830,12 @@ cglobal %1_pixels%2_l2_shift5, 5, 5 ; dst, src16, src8, dstStride RET %endmacro -INIT_MMX mmxext -PIXELS_L2_SHIFT5 put, 8 -PIXELS_L2_SHIFT5 avg, 8 - INIT_XMM sse2 -PIXELS_L2_SHIFT5 put, 16 -PIXELS_L2_SHIFT5 avg, 16 +PIXELS8_L2_SHIFT5 put +PIXELS8_L2_SHIFT5 avg + +PIXELS16_L2_SHIFT5 put, 16 +PIXELS16_L2_SHIFT5 avg, 16 %if ARCH_X86_64 %macro QPEL16_H_LOWPASS_L2_OP 1 -- 2.49.1 >From 97e212f75e5769838dd5076d909ad80e399cc23e Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Mon, 29 Sep 2025 23:08:11 +0200 Subject: [PATCH 25/26] avcodec/x86/h264_qpel_10bit: Remove SSE2 "cache64" duplicates The horizontal 10bit MC SSE2 functions are currently duplicated: They exist both in ordinary form as well as with a "sse2_cache64" suffix. A comment in ff_h264qpel_init_x86() indicates that this is due to older processors not liking accesses that cross cache lines, yet these functions are identical to the non-cache64 functions (apart from the unavoidable changes in the rip-offset). 
The only difference between these functions and the ordinary ones is that the cache64 ones are created via a special form of the INIT_XMM macro: "INIT_XMM sse2, cache64". This affects the name and apparently defines cpuflags_cache64, yet nothing checks for this, so both versions are identical. So remove the cache64 ones and treat the remaining ones like ordinary SSE2 functions. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 21 +++------------------ libavcodec/x86/h264_qpel_10bit.asm | 4 ---- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 52a6bfd5bf..6b9b4f7bc6 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -384,13 +384,10 @@ LUMA_MC_4(10, mc33, mmxext) LUMA_MC_816(10, mc00, sse2) LUMA_MC_816(10, mc10, sse2) -LUMA_MC_816(10, mc10, sse2_cache64) LUMA_MC_816(10, mc10, ssse3_cache64) LUMA_MC_816(10, mc20, sse2) -LUMA_MC_816(10, mc20, sse2_cache64) LUMA_MC_816(10, mc20, ssse3_cache64) LUMA_MC_816(10, mc30, sse2) -LUMA_MC_816(10, mc30, sse2_cache64) LUMA_MC_816(10, mc30, ssse3_cache64) LUMA_MC_816(10, mc01, sse2) LUMA_MC_816(10, mc11, sse2) @@ -488,9 +485,9 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_); SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_); SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_); - H264_QPEL_FUNCS_10(1, 0, sse2_cache64); - H264_QPEL_FUNCS_10(2, 0, sse2_cache64); - H264_QPEL_FUNCS_10(3, 0, sse2_cache64); + H264_QPEL_FUNCS_10(1, 0, sse2); + H264_QPEL_FUNCS_10(2, 0, sse2); + H264_QPEL_FUNCS_10(3, 0, sse2); } } @@ -516,17 +513,5 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) H264_QPEL_FUNCS_10(3, 0, ssse3_cache64); } } - - if (EXTERNAL_AVX(cpu_flags)) { - /* AVX implies 64 byte cache lines without the need to avoid unaligned - * memory accesses that cross the boundary between two cache lines.
- * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid - * having to treat SSE2 functions with such properties as AVX. */ - if (bit_depth == 10) { - H264_QPEL_FUNCS_10(1, 0, sse2); - H264_QPEL_FUNCS_10(2, 0, sse2); - H264_QPEL_FUNCS_10(3, 0, sse2); - } - } #endif } diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm index 80483b15ba..bad2d386eb 100644 --- a/libavcodec/x86/h264_qpel_10bit.asm +++ b/libavcodec/x86/h264_qpel_10bit.asm @@ -227,8 +227,6 @@ MC00 avg %define OP_MOV mova INIT_MMX mmxext %1 put, 4 -INIT_XMM sse2, cache64 -%1 put, 8 INIT_XMM ssse3, cache64 %1 put, 8 INIT_XMM sse2 @@ -237,8 +235,6 @@ INIT_XMM sse2 %define OP_MOV AVG_MOV INIT_MMX mmxext %1 avg, 4 -INIT_XMM sse2, cache64 -%1 avg, 8 INIT_XMM ssse3, cache64 %1 avg, 8 INIT_XMM sse2 -- 2.49.1 >From c42a768d0cabd1cbd4a427ca38362b99a5794e60 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Tue, 30 Sep 2025 00:32:27 +0200 Subject: [PATCH 26/26] avcodec/x86/h264_qpel: Use ptrdiff_t for strides Avoids having to sign-extend the strides in the assembly (it also is more correct given that the qpel_mc_func already uses ptrdiff_t). 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/h264_qpel.c | 73 +++++++++++++++++-------------- libavcodec/x86/h264_qpel_8bit.asm | 27 ------------ 2 files changed, 41 insertions(+), 59 deletions(-) diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 6b9b4f7bc6..636be54530 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -19,14 +19,14 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <stddef.h> + #include "libavutil/attributes.h" #include "libavutil/avassert.h" #include "libavutil/cpu.h" #include "libavutil/mem_internal.h" -#include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/h264qpel.h" -#include "libavcodec/pixels.h" #include "fpel.h" #if HAVE_X86ASM @@ -55,29 +55,30 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t ff_avg_pixels16_l2_mmxext((dst), (src1), (src2), (dststride), (src1stride)) #define DEF_QPEL(OPNAME)\ -void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ -void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ -void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ -void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ -void ff_ ## OPNAME ## _h264_qpel16_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ -void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ -void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ -void ff_ ## OPNAME ## 
_h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\ -void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, int srcStride);\ -void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\ -void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\ -void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int h);\ -void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\ -void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\ -void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ -void ff_ ## OPNAME ## _pixels8_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ -void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ +void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride);\ +void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride);\ +void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride);\ +void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride);\ +void ff_ ## OPNAME ## _h264_qpel16_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride);\ +void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride);\ +void ff_ ## 
OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride);\ +void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h);\ +void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, ptrdiff_t srcStride);\ +void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, ptrdiff_t dstStride);\ +void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, ptrdiff_t srcStride, int size);\ +void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride, int h);\ +void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride);\ +void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride);\ +void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, ptrdiff_t dstStride);\ +void ff_ ## OPNAME ## _pixels8_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, ptrdiff_t dstStride);\ +void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, ptrdiff_t dstStride);\ DEF_QPEL(avg) DEF_QPEL(put) #define QPEL_H264(OPNAME, OP, MMX)\ -static av_always_inline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ +static av_always_inline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ +{\ int w=3;\ src -= 2*srcStride+2;\ while(w--){\ @@ -89,7 +90,8 @@ static av_always_inline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *ds ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\ }\ \ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## 
MMX(uint8_t *dst, int16_t *tmp, int dstStride, int size){\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride, int size)\ +{\ int w = size>>4;\ do{\ ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, size);\ @@ -99,7 +101,8 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX }\ #define QPEL_H264_H16(OPNAME, EXT) \ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## EXT(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## EXT(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)\ +{\ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## EXT(dst , src , src2 , dstStride, src2Stride);\ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## EXT(dst+8, src+8, src2+8, dstStride, src2Stride);\ src += 8*dstStride;\ @@ -113,8 +116,8 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## EXT(u #if ARCH_X86_64 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride); -void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride); +void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride); +void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride); #else // ARCH_X86_64 #define QPEL_H264_H16_XMM(OPNAME, OP, EXT) QPEL_H264_H16(OPNAME, EXT) @@ -122,7 +125,8 @@ void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, con #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static 
av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ +{\ ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ src += 8*srcStride;\ @@ -132,17 +136,19 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint }\ #define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ +{\ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ }\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ +{\ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ } static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, const uint8_t *src, - int srcStride, + ptrdiff_t srcStride, int size) { int w = (size+8)>>3; @@ -155,20 +161,23 @@ static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, } #define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride, int size){\ +static 
av_always_inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int size)\ +{\ put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, srcStride, size);\ ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, size);\ }\ -static av_always_inline void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ +static av_always_inline void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ +{\ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 8);\ }\ -static av_always_inline void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int srcStride){\ +static av_always_inline void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ +{\ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 16);\ }\ #define SSSE3_HV2_LOWPASS_WRAPPER(OPNAME) \ static av_always_inline void \ -ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int size) \ +ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride, int size) \ {\ if (size == 8)\ ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(dst, tmp, dstStride);\ diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index 9ca78b0775..ede4f382e1 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -97,8 +97,6 @@ PIXELS4_L2 avg %macro QPEL4_H_LOWPASS_OP 1 cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d pxor m7, m7 mova m4, [pw_5] mova m5, [pw_16] @@ -140,8 +138,6 @@ QPEL4_H_LOWPASS_OP avg %macro 
QPEL8_H_LOWPASS_OP_XMM 1 cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d mov r4d, 8 pxor m7, m7 mova m6, [pw_5] @@ -184,8 +180,6 @@ QPEL8_H_LOWPASS_OP_XMM avg %macro QPEL4_H_LOWPASS_L2_OP 1 cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d pxor m7, m7 mova m4, [pw_5] mova m5, [pw_16] @@ -231,8 +225,6 @@ QPEL4_H_LOWPASS_L2_OP avg %macro QPEL8_H_LOWPASS_L2_OP 1 cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,6 ; dst, src, src2, dstStride, srcStride - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d mova m3, [pw_16] mov r5d, 8 pxor m5, m5 @@ -285,8 +277,6 @@ cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,9 ; dst, src, src2, dstStride, srcStrid cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, srcStride %define PW_16 [pw_16] %endif - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d mov r5d, 16 pxor m7, m7 mova m6, [pw_5] @@ -352,8 +342,6 @@ QPEL16_H_LOWPASS_L2 avg %macro QPEL8_H_LOWPASS_L2_OP_XMM 1 cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d mov r5d, 8 pxor m7, m7 mova m6, [pw_5] @@ -421,8 +409,6 @@ QPEL8_H_LOWPASS_L2_OP_XMM avg %macro QPEL4_V_LOWPASS_OP 1 cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d sub r1, r3 sub r1, r3 pxor m7, m7 @@ -454,8 +440,6 @@ QPEL4_V_LOWPASS_OP avg %macro QPEL8OR16_V_LOWPASS_OP 1 cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d sub r1, r3 sub r1, r3 pxor m7, m7 @@ -520,7 +504,6 @@ QPEL8OR16_V_LOWPASS_OP avg %macro QPEL4_HV1_LOWPASS_OP 1 cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride - movsxdifnidn r2, r2d pxor m7, m7 movh m0, [r0] movh m1, [r0+r2] @@ -542,7 +525,6 @@ cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride RET cglobal 
%1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride - movsxdifnidn r2, r2d mov r3d, 4 .loop: mova m0, [r0] @@ -573,7 +555,6 @@ QPEL4_HV1_LOWPASS_OP avg INIT_XMM sse2 cglobal put_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size - movsxdifnidn r2, r2d pxor m7, m7 movh m0, [r0] movh m1, [r0+r2] @@ -612,7 +593,6 @@ cglobal put_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size %macro QPEL8OR16_HV2_LOWPASS_OP 1 cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4 ; dst, tmp, dstStride, h - movsxdifnidn r2, r2d .loop: mova m0, [r1] mova m3, [r1+8] @@ -659,7 +639,6 @@ cglobal %1_h264_qpel8_hv2_lowpass, 3,4,7 ; dst, tmp, dstStride %else cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride %endif - movsxdifnidn r2, r2d mov r3d, 8 .loop: mova m1, [r1+16] @@ -692,7 +671,6 @@ cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride RET cglobal %1_h264_qpel16_hv2_lowpass, 3,4,8 ; dst, tmp, dstStride - movsxdifnidn r2, r2d mov r3d, 16 .loop: mova m4, [r1+32] @@ -752,7 +730,6 @@ QPEL8OR16_HV2_LOWPASS_OP_XMM avg %macro PIXELS4_L2_SHIFT5 1 cglobal %1_pixels4_l2_shift5,4,4 ; dst, src16, src8, dstStride - movsxdifnidn r3, r3d mova m0, [r1] mova m1, [r1+24] psraw m0, 5 @@ -783,7 +760,6 @@ PIXELS4_L2_SHIFT5 avg %macro PIXELS8_L2_SHIFT5 1 cglobal %1_pixels8_l2_shift5, 5, 5, 3 ; dst, src16, src8, dstStride - movsxdifnidn r3, r3d mov r4d, 8 .loop: movu m0, [r1] @@ -805,7 +781,6 @@ cglobal %1_pixels8_l2_shift5, 5, 5, 3 ; dst, src16, src8, dstStride %macro PIXELS16_L2_SHIFT5 2 cglobal %1_pixels%2_l2_shift5, 5, 5, 4 ; dst, src16, src8, dstStride - movsxdifnidn r3, r3d mov r4d, %2 .loop: movu m0, [r1] @@ -840,8 +815,6 @@ PIXELS16_L2_SHIFT5 avg, 16 %if ARCH_X86_64 %macro QPEL16_H_LOWPASS_L2_OP 1 cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d mov r5d, 16 pxor m15, m15 mova m14, [pw_5] -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- 
ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-09-30 2:09 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=175919815179.25.7383648006752454224@bf249f23a2c8 \ --to=ffmpeg-devel@ffmpeg.org \ --cc=code@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git