From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: mkver <code@ffmpeg.org> Subject: [FFmpeg-devel] [PATCH] Port cavs qpeldsp from MMX to SSE2 (PR #20648) Date: Sun, 05 Oct 2025 18:53:20 -0000 Message-ID: <175969040086.65.6437601043884458921@bf249f23a2c8> (raw) PR #20648 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20648 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20648.patch Also uncovered a bug in the MMX version that has gone unnoticed since it was added in 2006. Apparently no one uses cavs. >From 099834932c49192a9441b51e485aaa355514c520 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 5 Oct 2025 08:16:01 +0200 Subject: [PATCH 1/7] avcodec/cavs: Remove unused parameter Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/cavs.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/libavcodec/cavs.c b/libavcodec/cavs.c index 172cc5cc7a..4db6892452 100644 --- a/libavcodec/cavs.c +++ b/libavcodec/cavs.c @@ -387,7 +387,7 @@ void ff_cavs_modify_mb_i(AVSContext *h, int *pred_mode_uv) ****************************************************************************/ static inline void mc_dir_part(AVSContext *h, AVFrame *pic, int chroma_height, - int delta, int list, uint8_t *dest_y, + int list, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int src_x_offset, int src_y_offset, qpel_mc_func *qpix_op, @@ -452,7 +452,7 @@ static inline void mc_dir_part(AVSContext *h, AVFrame *pic, int chroma_height, chroma_op(dest_cr, src_cr, h->c_stride, chroma_height, mx & 7, my & 7); } -static inline void mc_part_std(AVSContext *h, int chroma_height, int delta, +static inline void mc_part_std(AVSContext *h, int chroma_height, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, @@ -474,7 +474,7 @@ static inline void mc_part_std(AVSContext *h, int chroma_height, int delta, if (mv->ref >= 0) { AVFrame *ref = 
h->DPB[mv->ref].f; - mc_dir_part(h, ref, chroma_height, delta, 0, + mc_dir_part(h, ref, chroma_height, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op, mv); @@ -484,7 +484,7 @@ static inline void mc_part_std(AVSContext *h, int chroma_height, int delta, if ((mv + MV_BWD_OFFS)->ref >= 0) { AVFrame *ref = h->DPB[0].f; - mc_dir_part(h, ref, chroma_height, delta, 1, + mc_dir_part(h, ref, chroma_height, 1, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op, mv + MV_BWD_OFFS); } @@ -493,32 +493,32 @@ static inline void mc_part_std(AVSContext *h, int chroma_height, int delta, void ff_cavs_inter(AVSContext *h, enum cavs_mb mb_type) { if (ff_cavs_partition_flags[mb_type] == 0) { // 16x16 - mc_part_std(h, 8, 0, h->cy, h->cu, h->cv, 0, 0, + mc_part_std(h, 8, h->cy, h->cu, h->cv, 0, 0, h->cdsp.put_cavs_qpel_pixels_tab[0], h->h264chroma.put_h264_chroma_pixels_tab[0], h->cdsp.avg_cavs_qpel_pixels_tab[0], h->h264chroma.avg_h264_chroma_pixels_tab[0], &h->mv[MV_FWD_X0]); } else { - mc_part_std(h, 4, 0, h->cy, h->cu, h->cv, 0, 0, + mc_part_std(h, 4, h->cy, h->cu, h->cv, 0, 0, h->cdsp.put_cavs_qpel_pixels_tab[1], h->h264chroma.put_h264_chroma_pixels_tab[1], h->cdsp.avg_cavs_qpel_pixels_tab[1], h->h264chroma.avg_h264_chroma_pixels_tab[1], &h->mv[MV_FWD_X0]); - mc_part_std(h, 4, 0, h->cy, h->cu, h->cv, 4, 0, + mc_part_std(h, 4, h->cy, h->cu, h->cv, 4, 0, h->cdsp.put_cavs_qpel_pixels_tab[1], h->h264chroma.put_h264_chroma_pixels_tab[1], h->cdsp.avg_cavs_qpel_pixels_tab[1], h->h264chroma.avg_h264_chroma_pixels_tab[1], &h->mv[MV_FWD_X1]); - mc_part_std(h, 4, 0, h->cy, h->cu, h->cv, 0, 4, + mc_part_std(h, 4, h->cy, h->cu, h->cv, 0, 4, h->cdsp.put_cavs_qpel_pixels_tab[1], h->h264chroma.put_h264_chroma_pixels_tab[1], h->cdsp.avg_cavs_qpel_pixels_tab[1], h->h264chroma.avg_h264_chroma_pixels_tab[1], &h->mv[MV_FWD_X2]); - mc_part_std(h, 4, 0, h->cy, h->cu, h->cv, 4, 4, + mc_part_std(h, 4, h->cy, h->cu, h->cv, 4, 4, h->cdsp.put_cavs_qpel_pixels_tab[1], 
h->h264chroma.put_h264_chroma_pixels_tab[1], h->cdsp.avg_cavs_qpel_pixels_tab[1], -- 2.49.1 >From 53c9a63f9998301a69b7b446950f9cb897c95b78 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 5 Oct 2025 12:35:17 +0200 Subject: [PATCH 2/7] avcodec/x86/cavsdsp: Fix vertical qpel motion compensation The prediction involves terms of the form (-1 * s0 - 2 * s1 + 96 * s2 + 42 * s3 - 7 * s4 + 64) >> 7, where the s values are in the range of 0..255. The sum can have values in the range -2550..35190, which does not fit into a signed 16bit integer. The code uses an arithmetic right shift, which does not yield the correct result for values >= 2^15; such values should be clipped to 255, yet are clipped to 0 instead. Fix this by shifting the values by 4096, so that the range is positive, use a logical right shift and subtract 32. bunny.mp4 from the FATE suite can be used to reproduce the problem. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/cavsdsp.c | 57 +++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c index 3d21744ef0..593369aaec 100644 --- a/libavcodec/x86/cavsdsp.c +++ b/libavcodec/x86/cavsdsp.c @@ -53,6 +53,7 @@ static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride) DECLARE_ASM_CONST(8, uint64_t, pw_42) = 0x002A002A002A002AULL; DECLARE_ASM_CONST(8, uint64_t, pw_96) = 0x0060006000600060ULL; +DECLARE_ASM_CONST(8, uint64_t, pw_4160) = 0x1040104010401040ULL; /***************************************************************************** * @@ -61,7 +62,7 @@ DECLARE_ASM_CONST(8, uint64_t, pw_96) = 0x0060006000600060ULL; ****************************************************************************/ /* vertical filter [-1 -2 96 42 -7 0] */ -#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \ +#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, SUB, MUL1, MUL2) \ "movd (%0), "#F" 
\n\t"\ "movq "#C", %%mm6 \n\t"\ "pmullw "MANGLE(MUL1)", %%mm6\n\t"\ @@ -80,13 +81,14 @@ DECLARE_ASM_CONST(8, uint64_t, pw_96) = 0x0060006000600060ULL; "psraw $1, "#B" \n\t"\ "psubw "#A", %%mm6 \n\t"\ "paddw "MANGLE(ADD)", %%mm6 \n\t"\ - "psraw $7, %%mm6 \n\t"\ + "psrlw $7, %%mm6 \n\t"\ + "psubw "MANGLE(SUB)", %%mm6 \n\t"\ "packuswb %%mm6, %%mm6 \n\t"\ OP(%%mm6, (%1), A, d) \ "add %3, %1 \n\t" /* vertical filter [ 0 -1 5 5 -1 0] */ -#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \ +#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, SUB, MUL1, MUL2) \ "movd (%0), "#F" \n\t"\ "movq "#C", %%mm6 \n\t"\ "paddw "#D", %%mm6 \n\t"\ @@ -102,7 +104,7 @@ DECLARE_ASM_CONST(8, uint64_t, pw_96) = 0x0060006000600060ULL; "add %3, %1 \n\t" /* vertical filter [ 0 -7 42 96 -2 -1] */ -#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \ +#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, SUB, MUL1, MUL2) \ "movd (%0), "#F" \n\t"\ "movq "#C", %%mm6 \n\t"\ "pmullw "MANGLE(MUL2)", %%mm6\n\t"\ @@ -121,13 +123,14 @@ DECLARE_ASM_CONST(8, uint64_t, pw_96) = 0x0060006000600060ULL; "psraw $1, "#E" \n\t"\ "psubw "#F", %%mm6 \n\t"\ "paddw "MANGLE(ADD)", %%mm6 \n\t"\ - "psraw $7, %%mm6 \n\t"\ + "psrlw $7, %%mm6 \n\t"\ + "psubw "MANGLE(SUB)", %%mm6 \n\t"\ "packuswb %%mm6, %%mm6 \n\t"\ OP(%%mm6, (%1), A, d) \ "add %3, %1 \n\t" -#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\ +#define QPEL_CAVSVNUM(VOP,OP,ADD,SUB,MUL1,MUL2)\ int w= 2;\ src -= 2*srcStride;\ \ @@ -149,34 +152,34 @@ DECLARE_ASM_CONST(8, uint64_t, pw_96) = 0x0060006000600060ULL; "punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm4 \n\t"\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 
%%mm4, OP, ADD, MUL1, MUL2)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, SUB, MUL1, MUL2)\ \ : "+a"(src), "+c"(dst)\ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\ - NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\ + NAMED_CONSTRAINTS_ADD(ADD,SUB,MUL1,MUL2)\ : "memory"\ );\ if(h==16){\ __asm__ volatile(\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\ + VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, SUB, MUL1, MUL2)\ + 
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, SUB, MUL1, MUL2)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, SUB, MUL1, MUL2)\ \ : "+a"(src), "+c"(dst)\ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\ - NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\ + NAMED_CONSTRAINTS_ADD(ADD,SUB,MUL1,MUL2)\ : "memory"\ );\ }\ @@ -236,17 +239,17 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptr \ static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\ { \ - QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,pw_96,pw_42) \ + QPEL_CAVSVNUM(QPEL_CAVSV1,OP,pw_4160,ff_pw_32,pw_96,pw_42) \ }\ \ static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\ { \ - QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,pw_42) \ + QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_4,ff_pw_5,pw_42) \ }\ \ static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\ { \ - QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,pw_96,pw_42) \ + QPEL_CAVSVNUM(QPEL_CAVSV3,OP,pw_4160,ff_pw_32,pw_96,pw_42) \ }\ \ static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ -- 2.49.1 >From 54e9de55529ab036ba951b6aacc687b56662a4dc Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 5 Oct 2025 10:52:31 +0200 Subject: [PATCH 3/7] tests/checkasm: Add CAVS qpel test This test already uncovered a bug in the vertical qpel motion compensation code. 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- tests/checkasm/Makefile | 1 + tests/checkasm/cavsdsp.c | 119 ++++++++++++++++++++++++++++++++++++++ tests/checkasm/checkasm.c | 3 + tests/checkasm/checkasm.h | 1 + tests/fate/checkasm.mak | 1 + 5 files changed, 125 insertions(+) create mode 100644 tests/checkasm/cavsdsp.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 82bcc21b01..7a9566eb8a 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -31,6 +31,7 @@ AVCODECOBJS-$(CONFIG_AAC_DECODER) += aacpsdsp.o \ AVCODECOBJS-$(CONFIG_AAC_ENCODER) += aacencdsp.o AVCODECOBJS-$(CONFIG_ALAC_DECODER) += alacdsp.o AVCODECOBJS-$(CONFIG_APV_DECODER) += apv_dsp.o +AVCODECOBJS-$(CONFIG_CAVS_DECODER) += cavsdsp.o AVCODECOBJS-$(CONFIG_DCA_DECODER) += dcadsp.o synth_filter.o AVCODECOBJS-$(CONFIG_DIRAC_DECODER) += diracdsp.o AVCODECOBJS-$(CONFIG_EXR_DECODER) += exrdsp.o diff --git a/tests/checkasm/cavsdsp.c b/tests/checkasm/cavsdsp.c new file mode 100644 index 0000000000..a6ffea588c --- /dev/null +++ b/tests/checkasm/cavsdsp.c @@ -0,0 +1,119 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include <assert.h> +#include <stddef.h> + +#include "checkasm.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/macros.h" +#include "libavutil/mem_internal.h" +#include "libavcodec/cavsdsp.h" + + +enum { +// DECLARE_ALIGNED can't handle enum constants. +#define MAX_BLOCK_SIZE 16 + MAX_STRIDE = 64, + /// BUF_SIZE is bigger than necessary in order to test strides > block width. + BUF_SIZE = ((MAX_BLOCK_SIZE - 1) * MAX_STRIDE + MAX_BLOCK_SIZE), + /** + * The qpel interpolation code accesses two lines above and three lines + * below the actual src block; it also accesses two pixels to the left + * and three to the right. + * The input is not subject to alignment requirements; making the input buffer + * bigger (by MAX_BLOCK_SIZE - 1) allows us to use a random misalignment. + */ + INPUT_BUF_SIZE = (2 + (2 + MAX_BLOCK_SIZE - 1 + 3) * MAX_STRIDE + MAX_BLOCK_SIZE + 3 + (MAX_BLOCK_SIZE - 1)) +}; + +#define randomize_buffers(buf0, buf1) \ + do { \ + static_assert(sizeof(buf0) == sizeof(buf1), "Incompatible buffers"); \ + static_assert(!(sizeof(buf0) % 4), "Tail handling needed"); \ + static_assert(sizeof(buf0[0]) == 1 && sizeof(buf1[0]) == 1, \ + "Pointer arithmetic needs to be adapted"); \ + for (size_t k = 0; k < sizeof(buf0); k += 4) { \ + uint32_t r = rnd(); \ + AV_WN32A(buf0 + k, r); \ + AV_WN32A(buf1 + k, r); \ + } \ + } while (0) + + +static void check_cavs_qpeldsp(void) +{ + DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, srcbuf0)[INPUT_BUF_SIZE]; + DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, srcbuf1)[INPUT_BUF_SIZE]; + DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, dstbuf0)[BUF_SIZE]; + DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, dstbuf1)[BUF_SIZE]; + CAVSDSPContext cavsdsp; + static const struct { + const char *name; + size_t offset; + } tests[] = { +#define TEST(NAME) { .name = #NAME, .offset = offsetof(CAVSDSPContext, NAME) } + TEST(put_cavs_qpel_pixels_tab), + TEST(avg_cavs_qpel_pixels_tab), + }; + declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, 
uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + + ff_cavsdsp_init(&cavsdsp); + + for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) { + qpel_mc_func (*func_tab)[16] = (qpel_mc_func (*)[16])((char*)&cavsdsp + tests[i].offset); + for (unsigned j = 0; j < 2; ++j) { + const unsigned blocksize = MAX_BLOCK_SIZE >> j; + + for (unsigned dxy = 0; dxy < 16; ++dxy) { + if (check_func(func_tab[j][dxy], "%s[%u][%u]", tests[i].name, j, dxy)) { + // Don't always use output that is 16-aligned. + size_t dst_offset = (rnd() % (MAX_BLOCK_SIZE / blocksize)) * blocksize; + ptrdiff_t stride = (rnd() % (MAX_STRIDE / blocksize) + 1) * blocksize; + size_t src_offset = 2 + 2 * stride + rnd() % MAX_BLOCK_SIZE; + const uint8_t *src0 = srcbuf0 + src_offset, *src1 = srcbuf1 + src_offset; + uint8_t *dst0 = dstbuf0 + dst_offset, *dst1 = dstbuf1 + dst_offset; + + if (rnd() & 1) { + // Flip stride. + dst1 += (blocksize - 1) * stride; + dst0 += (blocksize - 1) * stride; + // We need two lines above src and three lines below the block, + // hence blocksize * stride. 
+ src0 += blocksize * stride; + src1 += blocksize * stride; + stride = -stride; + } + + randomize_buffers(srcbuf0, srcbuf1); + randomize_buffers(dstbuf0, dstbuf1); + call_ref(dst0, src0, stride); + call_new(dst1, src1, stride); + if (memcmp(srcbuf0, srcbuf1, sizeof(srcbuf0)) || memcmp(dstbuf0, dstbuf1, sizeof(dstbuf0))) + fail(); + bench_new(dst0, src0, stride); + } + } + } + } +} + +void checkasm_check_cavsdsp(void) +{ + check_cavs_qpeldsp(); +} diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index d542f953e8..83aa26624d 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -141,6 +141,9 @@ static const struct { #if CONFIG_BSWAPDSP { "bswapdsp", checkasm_check_bswapdsp }, #endif + #if CONFIG_CAVS_DECODER + { "cavsdsp", checkasm_check_cavsdsp }, + #endif #if CONFIG_DCA_DECODER { "dcadsp", checkasm_check_dcadsp }, { "synth_filter", checkasm_check_synth_filter }, diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 0382963daa..bd7a896447 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -90,6 +90,7 @@ void checkasm_check_blackdetect(void); void checkasm_check_blend(void); void checkasm_check_blockdsp(void); void checkasm_check_bswapdsp(void); +void checkasm_check_cavsdsp(void); void checkasm_check_colordetect(void); void checkasm_check_colorspace(void); void checkasm_check_dcadsp(void); diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index 233950023c..6d16a65521 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -9,6 +9,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \ fate-checkasm-av_tx \ fate-checkasm-blockdsp \ fate-checkasm-bswapdsp \ + fate-checkasm-cavsdsp \ fate-checkasm-dcadsp \ fate-checkasm-diracdsp \ fate-checkasm-exrdsp \ -- 2.49.1 >From 7938c727b7cb3bdcb8158e4703c9f696e7ea9341 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 5 Oct 2025 15:06:20 +0200 Subject: [PATCH 4/7] avcodec/x86/cavsdsp: Add 
SSE2 mc20 horizontal motion compensation Basically a direct port of the MMXEXT one. The main difference is of course that one can process eight pixels (unpacked to words) at a time, leading to speedups. avg_cavs_qpel_pixels_tab[0][2]_c: 700.1 ( 1.00x) avg_cavs_qpel_pixels_tab[0][2]_mmxext: 158.1 ( 4.43x) avg_cavs_qpel_pixels_tab[0][2]_sse2: 86.0 ( 8.14x) avg_cavs_qpel_pixels_tab[1][2]_c: 171.9 ( 1.00x) avg_cavs_qpel_pixels_tab[1][2]_mmxext: 39.4 ( 4.36x) avg_cavs_qpel_pixels_tab[1][2]_sse2: 21.7 ( 7.92x) put_cavs_qpel_pixels_tab[0][2]_c: 525.7 ( 1.00x) put_cavs_qpel_pixels_tab[0][2]_mmxext: 148.5 ( 3.54x) put_cavs_qpel_pixels_tab[0][2]_sse2: 75.2 ( 6.99x) put_cavs_qpel_pixels_tab[1][2]_c: 129.5 ( 1.00x) put_cavs_qpel_pixels_tab[1][2]_mmxext: 36.7 ( 3.53x) put_cavs_qpel_pixels_tab[1][2]_sse2: 19.0 ( 6.81x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/Makefile | 2 +- libavcodec/x86/cavs_qpel.asm | 80 ++++++++++++++++++++++++++++++++++++ libavcodec/x86/cavsdsp.c | 28 ++++++++++++- 3 files changed, 107 insertions(+), 3 deletions(-) create mode 100644 libavcodec/x86/cavs_qpel.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 36168b4aff..ed60fa340f 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -152,7 +152,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o X86ASM-OBJS-$(CONFIG_APV_DECODER) += x86/apv_dsp.o -X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o +X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o x86/cavs_qpel.o X86ASM-OBJS-$(CONFIG_CFHD_ENCODER) += x86/cfhdencdsp.o X86ASM-OBJS-$(CONFIG_CFHD_DECODER) += x86/cfhddsp.o X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o diff --git a/libavcodec/x86/cavs_qpel.asm b/libavcodec/x86/cavs_qpel.asm new file mode 100644 index 0000000000..543ff21f0e --- /dev/null +++ b/libavcodec/x86/cavs_qpel.asm @@ 
-0,0 +1,80 @@ +;***************************************************************************** +;* SSE2-optimized CAVS QPEL code +;***************************************************************************** +;* Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de> +;* based on H.264 optimizations by Michael Niedermayer and Loren Merritt +;* Copyright (c) 2025 Andreas Rheinhardt <andreas.rheinhardt@outlook.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +cextern pw_4 +cextern pw_5 + +SECTION .text + +%macro op_avgh 3 + movh %3, %2 + pavgb %1, %3 + movh %2, %1 +%endmacro + +%macro op_puth 2-3 + movh %2, %1 +%endmacro + +%macro CAVS_QPEL_H 1 +; ff_put_cavs_qpel8_mc20(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) +cglobal %1_cavs_qpel8_mc20, 3,4,6 + mov r3d, 8 + jmp %1_cavs_qpel8_h_after_prologue + +; ff_put_cavs_qpel8_h(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h) +cglobal %1_cavs_qpel8_h, 4,4,6 +%1_cavs_qpel8_h_after_prologue: + mova m3, [pw_4] + mova m4, [pw_5] + pxor m5, m5 +.loop: + movh m0, [r1] + movh m1, [r1+1] + punpcklbw m0, m5 + punpcklbw m1, m5 + paddw m0, m1 + movh m1, [r1-1] + movh m2, [r1+2] + 
pmullw m0, m4 + punpcklbw m1, m5 + punpcklbw m2, m5 + paddw m0, m3 + add r1, r2 + paddw m1, m2 + psubw m0, m1 + psraw m0, 3 + packuswb m0, m5 + op_%1h m0, [r0], m1 + add r0, r2 + dec r3d + jne .loop + RET +%endmacro + +INIT_XMM sse2 +CAVS_QPEL_H avg +CAVS_QPEL_H put diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c index 593369aaec..00bab0d5f9 100644 --- a/libavcodec/x86/cavsdsp.c +++ b/libavcodec/x86/cavsdsp.c @@ -370,6 +370,25 @@ CAVS_MC(avg_, 8, mmxext) CAVS_MC(avg_, 16, mmxext) #endif /* HAVE_MMXEXT_INLINE */ +#if HAVE_SSE2_EXTERNAL +#define DEF_QPEL(OPNAME) \ + void ff_ ## OPNAME ## _cavs_qpel8_mc20_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ + void ff_ ## OPNAME ## _cavs_qpel8_h_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h); \ + +DEF_QPEL(put) +DEF_QPEL(avg) + +#define QPEL_CAVS_XMM(OPNAME, XMM) \ +static void OPNAME ## _cavs_qpel16_mc20_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \ +{ \ + ff_ ## OPNAME ## _cavs_qpel8_h_ ## XMM(dst, src, stride, 16); \ + ff_ ## OPNAME ## _cavs_qpel8_h_ ## XMM(dst + 8, src + 8, stride, 16); \ +} + +QPEL_CAVS_XMM(put, sse2) +QPEL_CAVS_XMM(avg, sse2) +#endif + av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c) { av_unused int cpu_flags = av_get_cpu_flags(); @@ -392,8 +411,13 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c) #endif #if HAVE_SSE2_EXTERNAL if (EXTERNAL_SSE2(cpu_flags)) { - c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2; - c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2; + c->put_cavs_qpel_pixels_tab[0][ 0] = put_cavs_qpel16_mc00_sse2; + c->put_cavs_qpel_pixels_tab[0][ 2] = put_cavs_qpel16_mc20_sse2; + c->put_cavs_qpel_pixels_tab[1][ 2] = ff_put_cavs_qpel8_mc20_sse2; + + c->avg_cavs_qpel_pixels_tab[0][ 0] = avg_cavs_qpel16_mc00_sse2; + c->avg_cavs_qpel_pixels_tab[0][ 2] = avg_cavs_qpel16_mc20_sse2; + c->avg_cavs_qpel_pixels_tab[1][ 2] = ff_avg_cavs_qpel8_mc20_sse2; c->cavs_idct8_add = cavs_idct8_add_sse2; c->idct_perm 
= FF_IDCT_PERM_TRANSPOSE; -- 2.49.1 >From 98ac1d913aad2829978dceeac94fea3b0b4a1a44 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 5 Oct 2025 16:06:20 +0200 Subject: [PATCH 5/7] avcodec/x86/cavs_qpel: Add SSE2 vertical motion compensation This is not based on the MMXEXT one, because the latter is quite suboptimal: Motion vector types mc01 and mc03 (vertical motion vectors with remainder of one quarter or three quarter) use different neighboring lines for interpolation: mc01 uses two lines above and two lines below, mc03 one line above and three lines below. The MMXEXT code uses a common macro for all of them and therefore reads six lines before it processes them (even reading lines which are not used at all), leading to severe register pressure. Another difference to the old code is that the positive and negative parts of the sum to calculate are accumulated separately and the subtraction is performed with unsigned saturation, so that one can avoid biasing the sum. The fact that the mc01 and mc03 filter coefficients are mirrors of each other has been exploited to reduce mc01 to mc03. But of course the most important difference between this code and the MMXEXT one is that XMM registers allow to process eight words at a time, ideal for 8x8 subblocks, whereas the MMXEXT code processes them in 4x8 or 4x16 blocks.
Benchmarks: avg_cavs_qpel_pixels_tab[0][4]_c: 917.0 ( 1.00x) avg_cavs_qpel_pixels_tab[0][4]_mmxext: 222.0 ( 4.13x) avg_cavs_qpel_pixels_tab[0][4]_sse2: 89.0 (10.31x) avg_cavs_qpel_pixels_tab[0][12]_c: 885.7 ( 1.00x) avg_cavs_qpel_pixels_tab[0][12]_mmxext: 223.2 ( 3.97x) avg_cavs_qpel_pixels_tab[0][12]_sse2: 88.5 (10.01x) avg_cavs_qpel_pixels_tab[1][4]_c: 222.4 ( 1.00x) avg_cavs_qpel_pixels_tab[1][4]_mmxext: 57.2 ( 3.89x) avg_cavs_qpel_pixels_tab[1][4]_sse2: 23.3 ( 9.55x) avg_cavs_qpel_pixels_tab[1][12]_c: 216.0 ( 1.00x) avg_cavs_qpel_pixels_tab[1][12]_mmxext: 57.4 ( 3.76x) avg_cavs_qpel_pixels_tab[1][12]_sse2: 22.6 ( 9.56x) put_cavs_qpel_pixels_tab[0][4]_c: 750.9 ( 1.00x) put_cavs_qpel_pixels_tab[0][4]_mmxext: 210.4 ( 3.57x) put_cavs_qpel_pixels_tab[0][4]_sse2: 84.2 ( 8.92x) put_cavs_qpel_pixels_tab[0][12]_c: 731.6 ( 1.00x) put_cavs_qpel_pixels_tab[0][12]_mmxext: 210.7 ( 3.47x) put_cavs_qpel_pixels_tab[0][12]_sse2: 84.1 ( 8.70x) put_cavs_qpel_pixels_tab[1][4]_c: 191.7 ( 1.00x) put_cavs_qpel_pixels_tab[1][4]_mmxext: 53.8 ( 3.56x) put_cavs_qpel_pixels_tab[1][4]_sse2: 24.5 ( 7.83x) put_cavs_qpel_pixels_tab[1][12]_c: 179.1 ( 1.00x) put_cavs_qpel_pixels_tab[1][12]_mmxext: 53.9 ( 3.32x) put_cavs_qpel_pixels_tab[1][12]_sse2: 24.0 ( 7.47x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/cavs_qpel.asm | 140 +++++++++++++++++++++++++++++++++++ libavcodec/x86/cavsdsp.c | 34 +++++++++ libavcodec/x86/constants.c | 1 + libavcodec/x86/constants.h | 1 + libavcodec/x86/diracdsp.asm | 2 +- 5 files changed, 177 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/cavs_qpel.asm b/libavcodec/x86/cavs_qpel.asm index 543ff21f0e..4e3444e910 100644 --- a/libavcodec/x86/cavs_qpel.asm +++ b/libavcodec/x86/cavs_qpel.asm @@ -24,8 +24,14 @@ %include "libavutil/x86/x86util.asm" +SECTION_RODATA + cextern pw_4 cextern pw_5 +cextern pw_7 +cextern pw_64 +pw_42: times 8 dw 42 +pw_96: times 8 dw 96 SECTION .text @@ -78,3 +84,137 @@ cglobal %1_cavs_qpel8_h, 
4,4,6 INIT_XMM sse2 CAVS_QPEL_H avg CAVS_QPEL_H put + +%macro FILT_V 1 + movh m3, [r1] + punpcklbw m3, m7 + mova m4, m1 + paddw m4, m2 + paddw m0, m3 + add r1, r2 + pmullw m4, m5 + psubw m4, m0 + paddw m4, m6 + psraw m4, 3 + packuswb m4, m7 + op_%1h m4, [r0], m0 + add r0, r2 + SWAP 0, 1, 2, 3 +%endmacro + +%macro CAVS_QPEL_MC02 1 +; ff_put_cavs_qpel8_mc02(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) +cglobal %1_cavs_qpel8_mc02, 3,4,8 + mov r3d, 8 + jmp %1_cavs_qpel8_v2_after_prologue + +; ff_put_cavs_qpel8_v2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h) +cglobal %1_cavs_qpel8_v2, 4,4,8 +%1_cavs_qpel8_v2_after_prologue: + movh m1, [r1] + sub r1, r2 + movh m0, [r1] + lea r1, [r1+2*r2] + pxor m7, m7 + movh m2, [r1] + add r1, r2 + punpcklbw m1, m7 + punpcklbw m0, m7 + punpcklbw m2, m7 + mova m5, [pw_5] + mova m6, [pw_4] +.loop: + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 + sub r3d, 4 + jne .loop + RET +%endmacro + +INIT_XMM sse2 +CAVS_QPEL_MC02 avg +CAVS_QPEL_MC02 put + +%macro FILT_V3 1 + pmullw m0, PW_7 + movh m4, [r1] + mova m5, m1 + mova m6, m2 + pmullw m5, PW_42 + punpcklbw m4, m7 + pmullw m6, PW_96 + paddw m0, m3 + add r1, r2 + paddw m0, m3 + paddw m5, m6 + paddw m0, m4 + ; m5-m0 can be in the -10*255..(42 + 96)*255 range and + ; therefore is not guaranteed to fit into either a signed or + ; an unsigned word. Because we need to clamp the result to 0..255 + ; anyway, we use saturated subtraction and a logical right shift + ; for rescaling. 
+ psubusw m5, m0 + paddw m5, PW_64 + psrlw m5, 7 + packuswb m5, m7 + op_%1h m5, [r0], m0 + add r0, r2 + SWAP 0, 1, 2, 3, 4 +%endmacro + +%macro CAVS_QPEL_MC03 1 +; ff_put_cavs_qpel8_mc03(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) +cglobal %1_cavs_qpel8_mc03, 3,4,8+4*ARCH_X86_64 + mov r3d, 8 + jmp %1_cavs_qpel8_v3_after_prologue + +; ff_put_cavs_qpel8_v3(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h) +cglobal %1_cavs_qpel8_v3, 4,4,8+4*ARCH_X86_64 +%1_cavs_qpel8_v3_after_prologue: + movh m1, [r1] + movh m2, [r1+r2] + movh m3, [r1+2*r2] + sub r1, r2 + pxor m7, m7 + movh m0, [r1] + lea r1, [r1+4*r2] + punpcklbw m1, m7 + punpcklbw m2, m7 +%if ARCH_X86_64 +%define PW_7 m8 +%define PW_42 m9 +%define PW_96 m10 +%define PW_64 m11 + mova m8, [pw_7] + mova m9, [pw_42] + mova m10, [pw_96] + mova m11, [pw_64] +%else +%define PW_7 [pw_7] +%define PW_42 [pw_42] +%define PW_96 [pw_96] +%define PW_64 [pw_64] +%endif + punpcklbw m3, m7 + punpcklbw m0, m7 + +.loop: + FILT_V3 %1 + FILT_V3 %1 + FILT_V3 %1 + FILT_V3 %1 + SWAP 0, 1, 2, 3, 4 + mova m3, m2 + mova m2, m1 + mova m1, m0 + mova m0, m4 + sub r3d, 4 + jne .loop + RET +%endmacro + +INIT_XMM sse2 +CAVS_QPEL_MC03 avg +CAVS_QPEL_MC03 put diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c index 00bab0d5f9..cc18c6cac6 100644 --- a/libavcodec/x86/cavsdsp.c +++ b/libavcodec/x86/cavsdsp.c @@ -373,12 +373,34 @@ CAVS_MC(avg_, 16, mmxext) #if HAVE_SSE2_EXTERNAL #define DEF_QPEL(OPNAME) \ void ff_ ## OPNAME ## _cavs_qpel8_mc20_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ + void ff_ ## OPNAME ## _cavs_qpel8_mc02_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ + void ff_ ## OPNAME ## _cavs_qpel8_mc03_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ void ff_ ## OPNAME ## _cavs_qpel8_h_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h); \ + void ff_ ## OPNAME ## _cavs_qpel8_v2_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h);\ + void ff_ ## 
OPNAME ## _cavs_qpel8_v3_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h);\ DEF_QPEL(put) DEF_QPEL(avg) #define QPEL_CAVS_XMM(OPNAME, XMM) \ +static void OPNAME ## _cavs_qpel16_mc02_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \ +{ \ + ff_ ## OPNAME ## _cavs_qpel8_v2_ ## XMM(dst, src, stride, 16); \ + ff_ ## OPNAME ## _cavs_qpel8_v2_ ## XMM(dst + 8, src + 8, stride, 16); \ +} \ +static void OPNAME ## _cavs_qpel16_mc03_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \ +{ \ + ff_ ## OPNAME ## _cavs_qpel8_v3_ ## XMM(dst, src, stride, 16); \ + ff_ ## OPNAME ## _cavs_qpel8_v3_ ## XMM(dst + 8, src + 8, stride, 16); \ +} \ +static void OPNAME ## _cavs_qpel8_mc01_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \ +{ \ + ff_ ## OPNAME ## _cavs_qpel8_mc03_ ## XMM(dst + 7 * stride, src + 8 * stride, -stride); \ +} \ +static void OPNAME ## _cavs_qpel16_mc01_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \ +{ \ + OPNAME ## _cavs_qpel16_mc03_ ## XMM(dst + 15 * stride, src + 16 * stride, -stride); \ +} \ static void OPNAME ## _cavs_qpel16_mc20_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \ { \ ff_ ## OPNAME ## _cavs_qpel8_h_ ## XMM(dst, src, stride, 16); \ @@ -413,11 +435,23 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c) if (EXTERNAL_SSE2(cpu_flags)) { c->put_cavs_qpel_pixels_tab[0][ 0] = put_cavs_qpel16_mc00_sse2; c->put_cavs_qpel_pixels_tab[0][ 2] = put_cavs_qpel16_mc20_sse2; + c->put_cavs_qpel_pixels_tab[0][ 4] = put_cavs_qpel16_mc01_sse2; + c->put_cavs_qpel_pixels_tab[0][ 8] = put_cavs_qpel16_mc02_sse2; + c->put_cavs_qpel_pixels_tab[0][12] = put_cavs_qpel16_mc03_sse2; c->put_cavs_qpel_pixels_tab[1][ 2] = ff_put_cavs_qpel8_mc20_sse2; + c->put_cavs_qpel_pixels_tab[1][ 4] = put_cavs_qpel8_mc01_sse2; + c->put_cavs_qpel_pixels_tab[1][ 8] = ff_put_cavs_qpel8_mc02_sse2; + c->put_cavs_qpel_pixels_tab[1][12] = ff_put_cavs_qpel8_mc03_sse2; c->avg_cavs_qpel_pixels_tab[0][ 0] = 
avg_cavs_qpel16_mc00_sse2; c->avg_cavs_qpel_pixels_tab[0][ 2] = avg_cavs_qpel16_mc20_sse2; + c->avg_cavs_qpel_pixels_tab[0][ 4] = avg_cavs_qpel16_mc01_sse2; + c->avg_cavs_qpel_pixels_tab[0][ 8] = avg_cavs_qpel16_mc02_sse2; + c->avg_cavs_qpel_pixels_tab[0][12] = avg_cavs_qpel16_mc03_sse2; c->avg_cavs_qpel_pixels_tab[1][ 2] = ff_avg_cavs_qpel8_mc20_sse2; + c->avg_cavs_qpel_pixels_tab[1][ 4] = avg_cavs_qpel8_mc01_sse2; + c->avg_cavs_qpel_pixels_tab[1][ 8] = ff_avg_cavs_qpel8_mc02_sse2; + c->avg_cavs_qpel_pixels_tab[1][12] = ff_avg_cavs_qpel8_mc03_sse2; c->cavs_idct8_add = cavs_idct8_add_sse2; c->idct_perm = FF_IDCT_PERM_TRANSPOSE; diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index 1e2f5990e4..eb01221c8d 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -30,6 +30,7 @@ DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0 DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL, 0x0004000400040004ULL, 0x0004000400040004ULL }; DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL }; +DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_7) = { 0x0007000700070007ULL, 0x0007000700070007ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL }; DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL; diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h index 7d0bd975b9..4c4d32ac8d 100644 --- a/libavcodec/x86/constants.h +++ b/libavcodec/x86/constants.h @@ -30,6 +30,7 @@ extern const ymm_reg ff_pw_2; extern const xmm_reg ff_pw_3; extern const ymm_reg ff_pw_4; extern const xmm_reg ff_pw_5; +extern const xmm_reg ff_pw_7; extern const xmm_reg ff_pw_8; extern const xmm_reg ff_pw_9; extern const uint64_t ff_pw_15; diff --git a/libavcodec/x86/diracdsp.asm 
b/libavcodec/x86/diracdsp.asm index 6ae7f888b3..1844aaf89a 100644 --- a/libavcodec/x86/diracdsp.asm +++ b/libavcodec/x86/diracdsp.asm @@ -21,11 +21,11 @@ %include "libavutil/x86/x86util.asm" SECTION_RODATA -pw_7: times 8 dw 7 convert_to_unsigned_10bit: times 4 dd 0x200 clip_10bit: times 8 dw 0x3ff cextern pw_3 +cextern pw_7 cextern pw_16 cextern pw_32 cextern pb_80 -- 2.49.1 >From 59f24af00427d65b9a6f01a1ed0f1cccde516968 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 5 Oct 2025 19:25:32 +0200 Subject: [PATCH 6/7] avcodec/x86/cavsdsp: Remove MMXEXT Qpeldsp Superseded by SSE2. Saves about 11630B here. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/cavsdsp.c | 299 --------------------------------------- 1 file changed, 299 deletions(-) diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c index cc18c6cac6..593faa85b8 100644 --- a/libavcodec/x86/cavsdsp.c +++ b/libavcodec/x86/cavsdsp.c @@ -23,14 +23,11 @@ */ #include "libavutil/attributes.h" -#include "libavutil/common.h" #include "libavutil/cpu.h" #include "libavutil/mem_internal.h" -#include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/cavsdsp.h" #include "libavcodec/idctdsp.h" -#include "constants.h" #include "fpel.h" #include "idctdsp.h" #include "config.h" @@ -49,278 +46,6 @@ static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride) #endif /* HAVE_SSE2_EXTERNAL */ -#if HAVE_MMXEXT_INLINE - -DECLARE_ASM_CONST(8, uint64_t, pw_42) = 0x002A002A002A002AULL; -DECLARE_ASM_CONST(8, uint64_t, pw_96) = 0x0060006000600060ULL; -DECLARE_ASM_CONST(8, uint64_t, pw_4160) = 0x1040104010401040ULL; - -/***************************************************************************** - * - * motion compensation - * - ****************************************************************************/ - -/* vertical filter [-1 -2 96 42 -7 0] */ -#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, SUB, MUL1, 
MUL2) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "pmullw "MANGLE(MUL1)", %%mm6\n\t"\ - "movq "#D", %%mm7 \n\t"\ - "pmullw "MANGLE(MUL2)", %%mm7\n\t"\ - "psllw $3, "#E" \n\t"\ - "psubw "#E", %%mm6 \n\t"\ - "psraw $3, "#E" \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw "#E", %%mm6 \n\t"\ - "paddw "#B", "#B" \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#B", %%mm6 \n\t"\ - "psraw $1, "#B" \n\t"\ - "psubw "#A", %%mm6 \n\t"\ - "paddw "MANGLE(ADD)", %%mm6 \n\t"\ - "psrlw $7, %%mm6 \n\t"\ - "psubw "MANGLE(SUB)", %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - -/* vertical filter [ 0 -1 5 5 -1 0] */ -#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, SUB, MUL1, MUL2) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "paddw "#D", %%mm6 \n\t"\ - "pmullw "MANGLE(MUL1)", %%mm6\n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#B", %%mm6 \n\t"\ - "psubw "#E", %%mm6 \n\t"\ - "paddw "MANGLE(ADD)", %%mm6 \n\t"\ - "psraw $3, %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - -/* vertical filter [ 0 -7 42 96 -2 -1] */ -#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, SUB, MUL1, MUL2) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "pmullw "MANGLE(MUL2)", %%mm6\n\t"\ - "movq "#D", %%mm7 \n\t"\ - "pmullw "MANGLE(MUL1)", %%mm7\n\t"\ - "psllw $3, "#B" \n\t"\ - "psubw "#B", %%mm6 \n\t"\ - "psraw $3, "#B" \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw "#B", %%mm6 \n\t"\ - "paddw "#E", "#E" \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#E", %%mm6 \n\t"\ - "psraw $1, "#E" \n\t"\ - "psubw "#F", %%mm6 \n\t"\ - "paddw "MANGLE(ADD)", %%mm6 \n\t"\ - "psrlw $7, %%mm6 \n\t"\ - "psubw "MANGLE(SUB)", %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - - -#define QPEL_CAVSVNUM(VOP,OP,ADD,SUB,MUL1,MUL2)\ - int w= 2;\ - src -= 2*srcStride;\ - \ - 
while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, SUB, MUL1, MUL2)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\ - NAMED_CONSTRAINTS_ADD(ADD,SUB,MUL1,MUL2)\ - : "memory"\ - );\ - if(h==16){\ - __asm__ volatile(\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, SUB, MUL1, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, SUB, MUL1, MUL2)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\ - NAMED_CONSTRAINTS_ADD(ADD,SUB,MUL1,MUL2)\ - : "memory"\ - );\ - 
}\ - src += 4-(h+5)*srcStride;\ - dst += 4-h*dstStride;\ - } - -#define QPEL_CAVS(OPNAME, OP, MMX)\ -static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ -{\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "movq "MANGLE(ff_pw_4)", %%mm5\n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm5, %%mm1 \n\t"\ - "psraw $3, %%mm0 \n\t"\ - "psraw $3, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q) \ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+m"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ - NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\ - : "memory"\ - );\ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\ -{ \ - QPEL_CAVSVNUM(QPEL_CAVSV1,OP,pw_4160,ff_pw_32,pw_96,pw_42) \ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\ -{ \ - QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_4,ff_pw_5,pw_42) \ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v3_ ## 
MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\ -{ \ - QPEL_CAVSVNUM(QPEL_CAVSV3,OP,pw_4160,ff_pw_32,pw_96,pw_42) \ -}\ -\ -static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ -{ \ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ -{ \ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ -{ \ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ -{ \ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ -{ \ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ -{ \ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\ -{ \ - OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## 
cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ - -#define CAVS_MC(OPNAME, SIZE, MMX) \ -static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ -{\ - OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ -{\ - OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ -{\ - OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ -{\ - OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\ -}\ - -#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" -#define AVG_MMXEXT_OP(a, b, temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" - -#endif /* HAVE_MMXEXT_INLINE */ - #if HAVE_MMX_EXTERNAL static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) @@ -354,22 +79,6 @@ static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c) #endif /* HAVE_MMX_EXTERNAL */ } -#define DSPFUNC(PFX, IDX, NUM, EXT) \ - c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \ - c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \ - c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \ - c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT; \ - -#if HAVE_MMXEXT_INLINE -QPEL_CAVS(put_, PUT_OP, mmxext) -QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext) - -CAVS_MC(put_, 8, mmxext) -CAVS_MC(put_, 16, mmxext) -CAVS_MC(avg_, 8, 
mmxext) -CAVS_MC(avg_, 16, mmxext) -#endif /* HAVE_MMXEXT_INLINE */ - #if HAVE_SSE2_EXTERNAL #define DEF_QPEL(OPNAME) \ void ff_ ## OPNAME ## _cavs_qpel8_mc20_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ @@ -418,14 +127,6 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c) if (X86_MMX(cpu_flags)) cavsdsp_init_mmx(c); -#if HAVE_MMXEXT_INLINE - if (INLINE_MMXEXT(cpu_flags)) { - DSPFUNC(put, 0, 16, mmxext); - DSPFUNC(put, 1, 8, mmxext); - DSPFUNC(avg, 0, 16, mmxext); - DSPFUNC(avg, 1, 8, mmxext); - } -#endif #if HAVE_MMX_EXTERNAL if (EXTERNAL_MMXEXT(cpu_flags)) { c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext; -- 2.49.1 >From 8decd810bff653e56c0f8c9aa0d6f8cc299096a5 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Sun, 5 Oct 2025 05:33:21 +0200 Subject: [PATCH 7/7] avcodec/x86/fpel: Add blocksize x blocksize avg/put functions This commit deduplicates the wrappers around the fpel functions for copying whole blocks (i.e. height equaling width). It does this in a manner which avoids having push/pop function arguments when the calling convention forces one to pass them on the stack (as in 32bit systems). 
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/x86/cavsdsp.c | 34 ++++------------------------------ libavcodec/x86/fpel.asm | 5 +++++ libavcodec/x86/fpel.h | 8 ++++++++ libavcodec/x86/h264_qpel.c | 23 +++-------------------- libavcodec/x86/qpeldsp_init.c | 21 ++++----------------- 5 files changed, 24 insertions(+), 67 deletions(-) diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c index 593faa85b8..d14b472d54 100644 --- a/libavcodec/x86/cavsdsp.c +++ b/libavcodec/x86/cavsdsp.c @@ -46,36 +46,10 @@ static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride) #endif /* HAVE_SSE2_EXTERNAL */ -#if HAVE_MMX_EXTERNAL -static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride) -{ - ff_put_pixels8_mmx(dst, src, stride, 8); -} - -static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride) -{ - ff_avg_pixels8_mmxext(dst, src, stride, 8); -} - -static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride) -{ - ff_put_pixels16_sse2(dst, src, stride, 16); -} - -static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride) -{ - ff_avg_pixels16_sse2(dst, src, stride, 16); -} -#endif - static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c) { #if HAVE_MMX_EXTERNAL - c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx; + c->put_cavs_qpel_pixels_tab[1][0] = ff_put_pixels8x8_mmx; #endif /* HAVE_MMX_EXTERNAL */ } @@ -129,12 +103,12 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c) #if HAVE_MMX_EXTERNAL if (EXTERNAL_MMXEXT(cpu_flags)) { - c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext; + c->avg_cavs_qpel_pixels_tab[1][0] = ff_avg_pixels8x8_mmxext; } #endif #if HAVE_SSE2_EXTERNAL if (EXTERNAL_SSE2(cpu_flags)) { - c->put_cavs_qpel_pixels_tab[0][ 0] = put_cavs_qpel16_mc00_sse2; + c->put_cavs_qpel_pixels_tab[0][ 0] = ff_put_pixels16x16_sse2; c->put_cavs_qpel_pixels_tab[0][ 
2] = put_cavs_qpel16_mc20_sse2; c->put_cavs_qpel_pixels_tab[0][ 4] = put_cavs_qpel16_mc01_sse2; c->put_cavs_qpel_pixels_tab[0][ 8] = put_cavs_qpel16_mc02_sse2; @@ -144,7 +118,7 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c) c->put_cavs_qpel_pixels_tab[1][ 8] = ff_put_cavs_qpel8_mc02_sse2; c->put_cavs_qpel_pixels_tab[1][12] = ff_put_cavs_qpel8_mc03_sse2; - c->avg_cavs_qpel_pixels_tab[0][ 0] = avg_cavs_qpel16_mc00_sse2; + c->avg_cavs_qpel_pixels_tab[0][ 0] = ff_avg_pixels16x16_sse2; c->avg_cavs_qpel_pixels_tab[0][ 2] = avg_cavs_qpel16_mc20_sse2; c->avg_cavs_qpel_pixels_tab[0][ 4] = avg_cavs_qpel16_mc01_sse2; c->avg_cavs_qpel_pixels_tab[0][ 8] = avg_cavs_qpel16_mc02_sse2; diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm index 8ca684efa9..68a05310f2 100644 --- a/libavcodec/x86/fpel.asm +++ b/libavcodec/x86/fpel.asm @@ -35,7 +35,12 @@ SECTION .text %define LOAD movu %define SAVE mova %endif +cglobal %1_pixels%2x%2, 3,5,4 + mov r3d, %2 + jmp %1_pixels%2_after_prologue + cglobal %1_pixels%2, 4,5,4 +%1_pixels%2_after_prologue: lea r4, [r2*3] .loop: LOAD m0, [r1] diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h index dc69e1cd83..598a7a6f63 100644 --- a/libavcodec/x86/fpel.h +++ b/libavcodec/x86/fpel.h @@ -24,12 +24,20 @@ void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_avg_pixels8x8_mmxext(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size); void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_avg_pixels16x16_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size); void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_pixels8x8_mmx(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size); void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_pixels16x16_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t 
line_size); #endif /* AVCODEC_X86_FPEL_H */ diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 636be54530..aa8ee61ae7 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -203,23 +203,6 @@ H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ -static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src, - ptrdiff_t stride) -{ - ff_put_pixels16_sse2(dst, src, stride, 16); -} -static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src, - ptrdiff_t stride) -{ - ff_avg_pixels16_sse2(dst, src, stride, 16); -} - -static void avg_h264_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride) -{ - ff_avg_pixels8_mmxext(dst, src, stride, 8); -} - #define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ @@ -462,7 +445,7 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) if (EXTERNAL_MMXEXT(cpu_flags)) { if (!high_bit_depth) { SET_QPEL_FUNCS_1PP(put_h264_qpel, 2, 4, mmxext, ); - c->avg_h264_qpel_pixels_tab[1][0] = avg_h264_qpel8_mc00_mmxext; + c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_pixels8x8_mmxext; SET_QPEL_FUNCS_1PP(avg_h264_qpel, 2, 4, mmxext, ); c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_pixels4_mmxext; } else if (bit_depth == 10) { @@ -485,8 +468,8 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) H264_QPEL_FUNCS(3, 1, sse2); H264_QPEL_FUNCS(3, 2, sse2); H264_QPEL_FUNCS(3, 3, sse2); - c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_sse2; - c->avg_h264_qpel_pixels_tab[0][0] = avg_h264_qpel16_mc00_sse2; + c->put_h264_qpel_pixels_tab[0][0] = ff_put_pixels16x16_sse2; + c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_pixels16x16_sse2; } if (bit_depth == 10) { diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c index 0bb39402d4..4bd45a7779 100644 --- 
a/libavcodec/x86/qpeldsp_init.c +++ b/libavcodec/x86/qpeldsp_init.c @@ -489,19 +489,6 @@ QPEL_OP(put_, _, mmxext, PASSTHROUGH) QPEL_OP(avg_, _, mmxext, STRIP_HEIGHT) QPEL_OP(put_no_rnd_, _no_rnd_, mmxext, PASSTHROUGH) -#define MC00(OPNAME, SIZE, EXT) \ -static void OPNAME ## _qpel ## SIZE ## _mc00_ ## EXT(uint8_t *dst, \ - const uint8_t *src,\ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## _pixels ## SIZE ##_ ## EXT(dst, src, stride, SIZE);\ -} - -MC00(put, 8, mmx) -MC00(avg, 8, mmxext) -MC00(put, 16, sse2) -MC00(avg, 16, sse2) - #endif /* HAVE_X86ASM */ #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ @@ -530,12 +517,12 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c) if (X86_MMXEXT(cpu_flags)) { #if HAVE_MMXEXT_EXTERNAL SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, ); - c->avg_qpel_pixels_tab[1][0] = avg_qpel8_mc00_mmxext; + c->avg_qpel_pixels_tab[1][0] = ff_avg_pixels8x8_mmxext; SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, ); SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, ); c->put_no_rnd_qpel_pixels_tab[1][0] = - c->put_qpel_pixels_tab[1][0] = put_qpel8_mc00_mmx; + c->put_qpel_pixels_tab[1][0] = ff_put_pixels8x8_mmx; SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, ); SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, ); @@ -544,8 +531,8 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c) #if HAVE_SSE2_EXTERNAL if (EXTERNAL_SSE2(cpu_flags)) { c->put_no_rnd_qpel_pixels_tab[0][0] = - c->put_qpel_pixels_tab[0][0] = put_qpel16_mc00_sse2; - c->avg_qpel_pixels_tab[0][0] = avg_qpel16_mc00_sse2; + c->put_qpel_pixels_tab[0][0] = ff_put_pixels16x16_sse2; + c->avg_qpel_pixels_tab[0][0] = ff_avg_pixels16x16_sse2; } #endif } -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-10-05 18:54 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=175969040086.65.6437601043884458921@bf249f23a2c8 \ --to=ffmpeg-devel@ffmpeg.org \ --cc=code@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git