* [FFmpeg-devel] [PATCH] avcodec/x86/qpel: Add specializations for put_l2 functions (PR #20785)
@ 2025-10-29 14:32 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-29 14:32 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #20785 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20785
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20785.patch
>From 995bc3690264ac6711d8364156fbfcf5b40766e1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 9 Oct 2025 03:57:33 +0200
Subject: [PATCH 1/3] avcodec/x86/{h264_qpel,qpeldsp_init}: Move shared decls
into header
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/h264_qpel.c | 9 +--------
libavcodec/x86/qpel.h | 38 +++++++++++++++++++++++++++++++++++
libavcodec/x86/qpeldsp_init.c | 14 +------------
3 files changed, 40 insertions(+), 21 deletions(-)
create mode 100644 libavcodec/x86/qpel.h
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 5d1445a8bb..30851e266e 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -28,6 +28,7 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264qpel.h"
#include "fpel.h"
+#include "qpel.h"
#if HAVE_X86ASM
void ff_avg_pixels4_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
@@ -35,14 +36,6 @@ void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *
ptrdiff_t stride);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
ptrdiff_t stride);
-void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride);
-void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride);
#define ff_put_pixels4_l2_mmxext(dst, src1, src2, dststride, src1stride, h) \
ff_put_pixels4_l2_mmxext((dst), (src1), (src2), (dststride))
#define ff_avg_pixels4_l2_mmxext(dst, src1, src2, dststride, src1stride, h) \
diff --git a/libavcodec/x86/qpel.h b/libavcodec/x86/qpel.h
new file mode 100644
index 0000000000..b30d5e23dc
--- /dev/null
+++ b/libavcodec/x86/qpel.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_QPEL_H
+#define AVCODEC_X86_QPEL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void ff_put_pixels8_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
+void ff_avg_pixels8_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_pixels16_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
+void ff_avg_pixels16_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
+
+#endif /* AVCODEC_X86_QPEL_H */
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index a1d1eb80b3..f88c804a48 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -27,25 +27,13 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/pixels.h"
#include "libavcodec/qpeldsp.h"
#include "fpel.h"
+#include "qpel.h"
-void ff_put_pixels8_l2_mmxext(uint8_t *dst,
- const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels8_l2_mmxext(uint8_t *dst,
- const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride);
-void ff_put_pixels16_l2_mmxext(uint8_t *dst,
- const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels16_l2_mmxext(uint8_t *dst,
- const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
--
2.49.1
>From d418a760f06765542a8a6e61eb28206e310f3782 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 29 Oct 2025 14:37:20 +0100
Subject: [PATCH 2/3] avcodec/x86/qpel: Add specializations for put_l2
functions
These functions are currently always called with a height equal to
either the block size or the block size + 1, and the height is
a compile-time constant at every call site. This makes it possible
to split each function into two, avoiding the check inside
the function for whether the height is odd or even.
The corresponding avg functions are only used with height == block size,
so they do not have a height parameter at all. Removing the
parameter from the put_l2 functions as well therefore simplifies
the C code.
The new functions increase the size of .text from qpel{,dsp}.o
by 32B here, yet they save 464B of C code.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/h264_qpel.c | 30 +++---
libavcodec/x86/h264_qpel_8bit.asm | 6 +-
libavcodec/x86/qpel.asm | 42 ++++----
libavcodec/x86/qpel.h | 24 ++---
libavcodec/x86/qpeldsp.asm | 34 ++++---
libavcodec/x86/qpeldsp_init.c | 164 ++++++++++++++++--------------
6 files changed, 157 insertions(+), 143 deletions(-)
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 30851e266e..b17ee7e02d 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -32,20 +32,18 @@
#if HAVE_X86ASM
void ff_avg_pixels4_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
-void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t stride);
-void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t stride);
-#define ff_put_pixels4_l2_mmxext(dst, src1, src2, dststride, src1stride, h) \
- ff_put_pixels4_l2_mmxext((dst), (src1), (src2), (dststride))
-#define ff_avg_pixels4_l2_mmxext(dst, src1, src2, dststride, src1stride, h) \
- ff_avg_pixels4_l2_mmxext((dst), (src1), (src2), (dststride))
-#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext
-#define ff_avg_pixels8_l2_sse2(dst, src1, src2, dststride, src1stride, h) \
- ff_avg_pixels8_l2_mmxext((dst), (src1), (src2), (dststride), (src1stride))
-#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
-#define ff_avg_pixels16_l2_sse2(dst, src1, src2, dststride, src1stride, h) \
- ff_avg_pixels16_l2_mmxext((dst), (src1), (src2), (dststride), (src1stride))
+void ff_put_pixels4x4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t stride);
+void ff_avg_pixels4x4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t stride);
+#define ff_put_pixels4x4_l2_mmxext(dst, src1, src2, dststride, src1stride) \
+ ff_put_pixels4x4_l2_mmxext((dst), (src1), (src2), (dststride))
+#define ff_avg_pixels4x4_l2_mmxext(dst, src1, src2, dststride, src1stride) \
+ ff_avg_pixels4x4_l2_mmxext((dst), (src1), (src2), (dststride))
+#define ff_put_pixels8x8_l2_sse2 ff_put_pixels8x8_l2_mmxext
+#define ff_avg_pixels8x8_l2_sse2 ff_avg_pixels8x8_l2_mmxext
+#define ff_put_pixels16x16_l2_sse2 ff_put_pixels16x16_l2_mmxext
+#define ff_avg_pixels16x16_l2_sse2 ff_avg_pixels16x16_l2_mmxext
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride);\
@@ -177,7 +175,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uin
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
+ ff_ ## OPNAME ## pixels ## SIZE ## x ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
@@ -189,7 +187,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
+ ff_ ## OPNAME ## pixels ## SIZE ## x ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride);\
}\
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT) \
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index 6e082819ac..3aa1f233a0 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -69,11 +69,11 @@ cglobal avg_pixels4, 3,4
mova %2, %1
%endmacro
-; void ff_put/avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
-; ptrdiff_t stride)
+; void ff_put/avg_pixels4x4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; ptrdiff_t stride)
%macro PIXELS4_L2 1
%define OP op_%1h
-cglobal %1_pixels4_l2, 4,4
+cglobal %1_pixels4x4_l2, 4,4
mova m0, [r1]
mova m1, [r1+r3]
lea r1, [r1+2*r3]
diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm
index 93f0d007c3..be8bc4f579 100644
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@@ -37,11 +37,9 @@ SECTION .text
%macro PIXELS8_L2 1
%define OP op_%1
%ifidn %1, put
-; void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h)
-cglobal put_pixels8_l2, 6,6
- test r5d, 1
- je .loop
+; void ff_put_pixels8x9_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal put_pixels8x9_l2, 5,6
mova m0, [r1]
mova m1, [r2]
add r1, r4
@@ -49,13 +47,14 @@ cglobal put_pixels8_l2, 6,6
pavgb m0, m1
OP m0, [r0]
add r0, r3
- dec r5d
-%else
-; void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-; ptrdiff_t dstStride, ptrdiff_t src1Stride)
-cglobal avg_pixels8_l2, 5,6
- mov r5d, 8
+ ; FIXME: avoid jump if prologue is empty
+ jmp %1_pixels8x8_after_prologue_ %+ cpuname
%endif
+; void ff_avg/put_pixels8x8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal %1_pixels8x8_l2, 5,6
+%1_pixels8x8_after_prologue_ %+ cpuname:
+ mov r5d, 8
.loop:
mova m0, [r1]
mova m1, [r1+r4]
@@ -86,11 +85,9 @@ PIXELS8_L2 avg
%macro PIXELS16_L2 1
%define OP op_%1
%ifidn %1, put
-; void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h)
-cglobal put_pixels16_l2, 6,6
- test r5d, 1
- je .loop
+; void ff_put_pixels16x17_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal put_pixels16x17_l2, 5,6
mova m0, [r1]
mova m1, [r1+8]
pavgb m0, [r2]
@@ -100,13 +97,14 @@ cglobal put_pixels16_l2, 6,6
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
- dec r5d
-%else
-; void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-; ptrdiff_t dstStride, ptrdiff_t src1Stride)
-cglobal avg_pixels16_l2, 5,6
- mov r5d, 16
+ ; FIXME: avoid jump if prologue is empty
+ jmp %1_pixels16x16_after_prologue_ %+ cpuname
%endif
+; void ff_avg/put_pixels16x16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal %1_pixels16x16_l2, 5,6
+%1_pixels16x16_after_prologue_ %+ cpuname:
+ mov r5d, 16
.loop:
mova m0, [r1]
mova m1, [r1+8]
diff --git a/libavcodec/x86/qpel.h b/libavcodec/x86/qpel.h
index b30d5e23dc..c4b6ee0413 100644
--- a/libavcodec/x86/qpel.h
+++ b/libavcodec/x86/qpel.h
@@ -22,17 +22,17 @@
#include <stddef.h>
#include <stdint.h>
-void ff_put_pixels8_l2_mmxext(uint8_t *dst,
- const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels8_l2_mmxext(uint8_t *dst,
- const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride);
-void ff_put_pixels16_l2_mmxext(uint8_t *dst,
- const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels16_l2_mmxext(uint8_t *dst,
- const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_pixels8x8_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_avg_pixels8x8_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_pixels16x16_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_avg_pixels16x16_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
#endif /* AVCODEC_X86_QPEL_H */
diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm
index 7fa7dbd2dc..52ddd8a8b2 100644
--- a/libavcodec/x86/qpeldsp.asm
+++ b/libavcodec/x86/qpeldsp.asm
@@ -32,13 +32,11 @@ cextern pw_20
SECTION .text
-; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h)
%macro PUT_NO_RND_PIXELS8_L2 0
-cglobal put_no_rnd_pixels8_l2, 6,6
+; void ff_put_no_rnd_pixels8x9_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal put_no_rnd_pixels8x9_l2, 5,6
pcmpeqb m6, m6
- test r5d, 1
- je .loop
mova m0, [r1]
mova m1, [r2]
add r1, r4
@@ -49,7 +47,14 @@ cglobal put_no_rnd_pixels8_l2, 6,6
pxor m0, m6
mova [r0], m0
add r0, r3
- dec r5d
+ jmp put_no_rnd_pixels8x8_after_prologue_ %+ cpuname
+
+; void ff_put_no_rnd_pixels8x8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal put_no_rnd_pixels8x8_l2, 5,6
+ pcmpeqb m6, m6
+put_no_rnd_pixels8x8_after_prologue_ %+ cpuname:
+ mov r5d, 8
.loop:
mova m0, [r1]
add r1, r4
@@ -97,13 +102,11 @@ INIT_MMX mmxext
PUT_NO_RND_PIXELS8_L2
-; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-; ptrdiff_t dstStride, ptrdiff_t src1Stride, int h)
+; void ff_put_no_rnd_pixels16x17_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; ptrdiff_t dstStride, ptrdiff_t src1Stride)
%macro PUT_NO_RND_PIXELS16_l2 0
-cglobal put_no_rnd_pixels16_l2, 6,6
+cglobal put_no_rnd_pixels16x17_l2, 5,6
pcmpeqb m6, m6
- test r5d, 1
- je .loop
mova m0, [r1]
mova m1, [r1+8]
mova m2, [r2]
@@ -121,7 +124,14 @@ cglobal put_no_rnd_pixels16_l2, 6,6
mova [r0], m0
mova [r0+8], m1
add r0, r3
- dec r5d
+ jmp put_no_rnd_pixels16x16_after_prologue_ %+ cpuname
+
+; void ff_put_no_rnd_pixels16x16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal put_no_rnd_pixels16x16_l2, 5,6
+ pcmpeqb m6, m6
+put_no_rnd_pixels16x16_after_prologue_ %+ cpuname:
+ mov r5d, 16
.loop:
mova m0, [r1]
mova m1, [r1+8]
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index f88c804a48..cab2ac433a 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -31,12 +31,24 @@
#include "fpel.h"
#include "qpel.h"
-void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst,
- const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst,
- const uint8_t *src1, const uint8_t *src2,
- ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
+void ff_put_pixels8x9_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_pixels16x17_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_no_rnd_pixels8x8_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_no_rnd_pixels8x9_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_no_rnd_pixels16x16_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_no_rnd_pixels16x17_l2_mmxext(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
ptrdiff_t dstStride, ptrdiff_t srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
@@ -70,7 +82,7 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
#if HAVE_X86ASM
-#define QPEL_OP(OPNAME, RND, MMX, ARG) \
+#define QPEL_OP(OPNAME, RND, MMX) \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
@@ -79,8 +91,8 @@ static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
- ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src, half, \
- stride, stride, 8)); \
+ ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, src, half, \
+ stride, stride); \
} \
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, \
@@ -99,8 +111,8 @@ static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
- ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src + 1, half, \
- stride, stride, 8)); \
+ ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, src + 1, half, \
+ stride, stride); \
} \
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, \
@@ -111,8 +123,8 @@ static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
- ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src, half, \
- stride, stride, 8)); \
+ ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, src, half, \
+ stride, stride); \
} \
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, \
@@ -131,8 +143,8 @@ static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
- ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src + stride, half, \
- stride, stride, 8)); \
+ ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, src + stride, half, \
+ stride, stride); \
} \
\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, \
@@ -144,11 +156,11 @@ static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
- ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
- stride, 9); \
+ ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src, halfH, 8, \
+ stride); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH, halfHV, \
- stride, 8, 8)); \
+ ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 8); \
} \
\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, \
@@ -160,11 +172,11 @@ static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
- ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
- stride, 9); \
+ ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src + 1, halfH, 8, \
+ stride); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH, halfHV, \
- stride, 8, 8)); \
+ ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 8); \
} \
\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, \
@@ -176,11 +188,11 @@ static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
- ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
- stride, 9); \
+ ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src, halfH, 8, \
+ stride); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH + 8, halfHV, \
- stride, 8, 8)); \
+ ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH + 8, halfHV, \
+ stride, 8); \
} \
\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, \
@@ -192,11 +204,11 @@ static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
- ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
- stride, 9); \
+ ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src + 1, halfH, 8, \
+ stride); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH + 8, halfHV, \
- stride, 8, 8)); \
+ ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH + 8, halfHV, \
+ stride, 8); \
} \
\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, \
@@ -209,8 +221,8 @@ static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH, halfHV, \
- stride, 8, 8)); \
+ ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 8); \
} \
\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, \
@@ -223,8 +235,8 @@ static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH + 8, halfHV, \
- stride, 8, 8)); \
+ ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH + 8, halfHV, \
+ stride, 8); \
} \
\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, \
@@ -235,8 +247,8 @@ static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
- ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
- 8, stride, 9); \
+ ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src, halfH, \
+ 8, stride); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
@@ -249,8 +261,8 @@ static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
- ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
- stride, 9); \
+ ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src + 1, halfH, 8, \
+ stride); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
@@ -275,8 +287,8 @@ static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
- ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src, half, \
- stride, stride, 16)); \
+ ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, src, half, \
+ stride, stride); \
} \
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, \
@@ -295,8 +307,8 @@ static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, \
uint8_t *const half = (uint8_t*) temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
- ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src + 1, half, \
- stride, stride, 16)); \
+ ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, src + 1, half, \
+ stride, stride); \
} \
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, \
@@ -307,8 +319,8 @@ static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
- ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src, half, \
- stride, stride, 16)); \
+ ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, src, half, \
+ stride, stride); \
} \
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, \
@@ -327,8 +339,8 @@ static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
- ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src+stride, half, \
- stride, stride, 16)); \
+ ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, src+stride, half, \
+ stride, stride); \
} \
\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, \
@@ -340,12 +352,12 @@ static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
- ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
- stride, 17); \
+ ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src, halfH, 16, \
+ stride); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
- ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH, halfHV, \
- stride, 16, 16)); \
+ ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 16); \
} \
\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, \
@@ -357,12 +369,12 @@ static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
- ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
- stride, 17); \
+ ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src + 1, halfH, 16, \
+ stride); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
- ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH, halfHV, \
- stride, 16, 16)); \
+ ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 16); \
} \
\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, \
@@ -374,12 +386,12 @@ static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
- ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
- stride, 17); \
+ ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src, halfH, 16, \
+ stride); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
- ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH + 16, halfHV, \
- stride, 16, 16)); \
+ ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH + 16, halfHV, \
+ stride, 16); \
} \
\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, \
@@ -391,12 +403,12 @@ static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
- ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
- stride, 17); \
+ ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src + 1, halfH, 16, \
+ stride); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
- ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH + 16, halfHV, \
- stride, 16, 16)); \
+ ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH + 16, halfHV, \
+ stride, 16); \
} \
\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, \
@@ -410,8 +422,8 @@ static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
- ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH, halfHV, \
- stride, 16, 16)); \
+ ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 16); \
} \
\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, \
@@ -425,8 +437,8 @@ static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
- ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH + 16, halfHV, \
- stride, 16, 16)); \
+ ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH + 16, halfHV, \
+ stride, 16); \
} \
\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, \
@@ -437,8 +449,8 @@ static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
- ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
- stride, 17); \
+ ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src, halfH, 16, \
+ stride); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
@@ -451,8 +463,8 @@ static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
- ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
- stride, 17); \
+ ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src + 1, halfH, 16, \
+ stride); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
@@ -469,13 +481,9 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, \
stride, 16); \
}
-#define PASSTHROUGH(...) __VA_ARGS__
-#define STRIP_HEIGHT(dst, src1, src2, dststride, srcstride, height) \
- (dst), (src1), (src2), (dststride), (srcstride)
-
-QPEL_OP(put_, _, mmxext, PASSTHROUGH)
-QPEL_OP(avg_, _, mmxext, STRIP_HEIGHT)
-QPEL_OP(put_no_rnd_, _no_rnd_, mmxext, PASSTHROUGH)
+QPEL_OP(put_, _, mmxext)
+QPEL_OP(avg_, _, mmxext)
+QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)
#endif /* HAVE_X86ASM */
--
2.49.1
>From 756b072c08a41659b9c7400e4c60f070fd16eefa Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 29 Oct 2025 15:07:40 +0100
Subject: [PATCH 3/3] avcodec/x86/h264_qpel: Add and use
ff_{avg,put}_pixels16x16_l2_sse2()
This avoids mmx (the size 16 h264qpel dsp now no longer uses any mmx)
and improves performance, particularly for the avg case:
Old benchmarks:
avg_h264_qpel_16_mc01_8_c: 780.0 ( 1.00x)
avg_h264_qpel_16_mc01_8_sse2: 91.2 ( 8.55x)
avg_h264_qpel_16_mc03_8_c: 804.0 ( 1.00x)
avg_h264_qpel_16_mc03_8_sse2: 91.2 ( 8.82x)
put_h264_qpel_16_mc01_8_c: 779.5 ( 1.00x)
put_h264_qpel_16_mc01_8_sse2: 82.8 ( 9.41x)
put_h264_qpel_16_mc03_8_c: 770.1 ( 1.00x)
put_h264_qpel_16_mc03_8_sse2: 82.5 ( 9.33x)
New benchmarks:
avg_h264_qpel_16_mc01_8_c: 783.9 ( 1.00x)
avg_h264_qpel_16_mc01_8_sse2: 84.1 ( 9.32x)
avg_h264_qpel_16_mc03_8_c: 797.4 ( 1.00x)
avg_h264_qpel_16_mc03_8_sse2: 83.9 ( 9.51x)
put_h264_qpel_16_mc01_8_c: 767.4 ( 1.00x)
put_h264_qpel_16_mc01_8_sse2: 80.5 ( 9.53x)
put_h264_qpel_16_mc03_8_c: 779.9 ( 1.00x)
put_h264_qpel_16_mc03_8_sse2: 80.3 ( 9.71x)
(qpeldsp will use these functions when it gets ported to SSE2;
then the mmxext functions will be removed as well.)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/h264_qpel.c | 2 --
libavcodec/x86/qpel.asm | 45 +++++++++++++++++++++-----------------
libavcodec/x86/qpel.h | 6 +++++
3 files changed, 31 insertions(+), 22 deletions(-)
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index b17ee7e02d..f2bcca1e11 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -42,8 +42,6 @@ void ff_avg_pixels4x4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t
ff_avg_pixels4x4_l2_mmxext((dst), (src1), (src2), (dststride))
#define ff_put_pixels8x8_l2_sse2 ff_put_pixels8x8_l2_mmxext
#define ff_avg_pixels8x8_l2_sse2 ff_avg_pixels8x8_l2_mmxext
-#define ff_put_pixels16x16_l2_sse2 ff_put_pixels16x16_l2_mmxext
-#define ff_avg_pixels16x16_l2_sse2 ff_avg_pixels16x16_l2_mmxext
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride);\
diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm
index be8bc4f579..8f18cf93db 100644
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@@ -34,53 +34,58 @@ SECTION .text
mova %2, %1
%endmacro
-%macro PIXELS8_L2 1
+%macro PIXELS_L2 2-3 ; avg vs put, size, size+1
%define OP op_%1
%ifidn %1, put
+%if notcpuflag(sse2) ; SSE2 currently only uses 16x16
; void ff_put_pixels8x9_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
; ptrdiff_t dstStride, ptrdiff_t src1Stride)
-cglobal put_pixels8x9_l2, 5,6
- mova m0, [r1]
- mova m1, [r2]
+cglobal put_pixels%2x%3_l2, 5,6,2
+ movu m0, [r1]
+ pavgb m0, [r2]
add r1, r4
- add r2, 8
- pavgb m0, m1
+ add r2, mmsize
OP m0, [r0]
add r0, r3
; FIXME: avoid jump if prologue is empty
- jmp %1_pixels8x8_after_prologue_ %+ cpuname
+ jmp %1_pixels%2x%2_after_prologue_ %+ cpuname
+%endif
%endif
; void ff_avg/put_pixels8x8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
; ptrdiff_t dstStride, ptrdiff_t src1Stride)
-cglobal %1_pixels8x8_l2, 5,6
-%1_pixels8x8_after_prologue_ %+ cpuname:
- mov r5d, 8
+cglobal %1_pixels%2x%2_l2, 5,6,2
+%1_pixels%2x%2_after_prologue_ %+ cpuname:
+ mov r5d, %2
.loop:
- mova m0, [r1]
- mova m1, [r1+r4]
+ movu m0, [r1]
+ movu m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2]
- pavgb m1, [r2+8]
+ pavgb m1, [r2+mmsize]
OP m0, [r0]
OP m1, [r0+r3]
lea r0, [r0+2*r3]
- mova m0, [r1]
- mova m1, [r1+r4]
+ movu m0, [r1]
+ movu m1, [r1+r4]
lea r1, [r1+2*r4]
- pavgb m0, [r2+16]
- pavgb m1, [r2+24]
+ pavgb m0, [r2+2*mmsize]
+ pavgb m1, [r2+3*mmsize]
OP m0, [r0]
OP m1, [r0+r3]
lea r0, [r0+2*r3]
- add r2, 32
+ add r2, 4*mmsize
sub r5d, 4
jne .loop
RET
%endmacro
INIT_MMX mmxext
-PIXELS8_L2 put
-PIXELS8_L2 avg
+PIXELS_L2 put, 8, 9
+PIXELS_L2 avg, 8
+
+INIT_XMM sse2
+PIXELS_L2 put, 16, 17
+PIXELS_L2 avg, 16
%macro PIXELS16_L2 1
%define OP op_%1
diff --git a/libavcodec/x86/qpel.h b/libavcodec/x86/qpel.h
index c4b6ee0413..61c0473331 100644
--- a/libavcodec/x86/qpel.h
+++ b/libavcodec/x86/qpel.h
@@ -31,8 +31,14 @@ void ff_avg_pixels8x8_l2_mmxext(uint8_t *dst,
void ff_put_pixels16x16_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_pixels16x16_l2_sse2(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
void ff_avg_pixels16x16_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_avg_pixels16x16_l2_sse2(uint8_t *dst,
+ const uint8_t *src1, const uint8_t *src2,
+ ptrdiff_t dstStride, ptrdiff_t src1Stride);
#endif /* AVCODEC_X86_QPEL_H */
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-10-29 14:33 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-29 14:32 [FFmpeg-devel] [PATCH] avcodec/x86/qpel: Add specializations for put_l2 functions (PR #20785) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git