[FFmpeg-devel] [PATCH] avcodec/x86/qpel: Add specializations for put

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH] avcodec/x86/qpel: Add specializations for put_l2 functions (PR #20785)
@ 2025-10-29 14:32 mkver via ffmpeg-devel
  0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-29 14:32 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: mkver

PR #20785 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20785
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20785.patch


>From 995bc3690264ac6711d8364156fbfcf5b40766e1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 9 Oct 2025 03:57:33 +0200
Subject: [PATCH 1/3] avcodec/x86/{h264_qpel,qpeldsp_init}: Move shared decls
 into header

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/h264_qpel.c    |  9 +--------
 libavcodec/x86/qpel.h         | 38 +++++++++++++++++++++++++++++++++++
 libavcodec/x86/qpeldsp_init.c | 14 +------------
 3 files changed, 40 insertions(+), 21 deletions(-)
 create mode 100644 libavcodec/x86/qpel.h

diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 5d1445a8bb..30851e266e 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -28,6 +28,7 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/h264qpel.h"
 #include "fpel.h"
+#include "qpel.h"
 
 #if HAVE_X86ASM
 void ff_avg_pixels4_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
@@ -35,14 +36,6 @@ void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *
                               ptrdiff_t stride);
 void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                               ptrdiff_t stride);
-void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
-                              ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
-                              ptrdiff_t dstStride, ptrdiff_t src1Stride);
-void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
-                               ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
-                               ptrdiff_t dstStride, ptrdiff_t src1Stride);
 #define ff_put_pixels4_l2_mmxext(dst, src1, src2, dststride, src1stride, h) \
     ff_put_pixels4_l2_mmxext((dst), (src1), (src2), (dststride))
 #define ff_avg_pixels4_l2_mmxext(dst, src1, src2, dststride, src1stride, h) \
diff --git a/libavcodec/x86/qpel.h b/libavcodec/x86/qpel.h
new file mode 100644
index 0000000000..b30d5e23dc
--- /dev/null
+++ b/libavcodec/x86/qpel.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_QPEL_H
+#define AVCODEC_X86_QPEL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void ff_put_pixels8_l2_mmxext(uint8_t *dst,
+                              const uint8_t *src1, const uint8_t *src2,
+                              ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
+void ff_avg_pixels8_l2_mmxext(uint8_t *dst,
+                              const uint8_t *src1, const uint8_t *src2,
+                              ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_pixels16_l2_mmxext(uint8_t *dst,
+                               const uint8_t *src1, const uint8_t *src2,
+                               ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
+void ff_avg_pixels16_l2_mmxext(uint8_t *dst,
+                               const uint8_t *src1, const uint8_t *src2,
+                               ptrdiff_t dstStride, ptrdiff_t src1Stride);
+
+#endif /* AVCODEC_X86_QPEL_H */
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index a1d1eb80b3..f88c804a48 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -27,25 +27,13 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/pixels.h"
 #include "libavcodec/qpeldsp.h"
 #include "fpel.h"
+#include "qpel.h"
 
-void ff_put_pixels8_l2_mmxext(uint8_t *dst,
-                              const uint8_t *src1, const uint8_t *src2,
-                              ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst,
                                      const uint8_t *src1, const uint8_t *src2,
                                      ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels8_l2_mmxext(uint8_t *dst,
-                              const uint8_t *src1, const uint8_t *src2,
-                              ptrdiff_t dstStride, ptrdiff_t src1Stride);
-void ff_put_pixels16_l2_mmxext(uint8_t *dst,
-                               const uint8_t *src1, const uint8_t *src2,
-                               ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels16_l2_mmxext(uint8_t *dst,
-                               const uint8_t *src1, const uint8_t *src2,
-                               ptrdiff_t dstStride, ptrdiff_t src1Stride);
 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst,
                                       const uint8_t *src1, const uint8_t *src2,
                                       ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-- 
2.49.1


>From d418a760f06765542a8a6e61eb28206e310f3782 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 29 Oct 2025 14:37:20 +0100
Subject: [PATCH 2/3] avcodec/x86/qpel: Add specializations for put_l2
 functions

These functions are currently always called with height either
being equal to the block size or block size+1. height is
a compile-time constant at every callsite. This makes it possible
to split this function into two to avoid the check inside
the function for whether height is odd or even.

The corresponding avg function is only used with height == block size,
so that it does not have a height parameter at all. Removing the
parameter from the put_l2 functions as well therefore simplifies
the C code.

The new functions increase the size of .text from qpel{dsp}.o
by 32B here, yet they save 464B of C code here.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/h264_qpel.c        |  30 +++---
 libavcodec/x86/h264_qpel_8bit.asm |   6 +-
 libavcodec/x86/qpel.asm           |  42 ++++----
 libavcodec/x86/qpel.h             |  24 ++---
 libavcodec/x86/qpeldsp.asm        |  34 ++++---
 libavcodec/x86/qpeldsp_init.c     | 164 ++++++++++++++++--------------
 6 files changed, 157 insertions(+), 143 deletions(-)

diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 30851e266e..b17ee7e02d 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -32,20 +32,18 @@
 
 #if HAVE_X86ASM
 void ff_avg_pixels4_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
-void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
-                              ptrdiff_t stride);
-void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
-                              ptrdiff_t stride);
-#define ff_put_pixels4_l2_mmxext(dst, src1, src2, dststride, src1stride, h) \
-    ff_put_pixels4_l2_mmxext((dst), (src1), (src2), (dststride))
-#define ff_avg_pixels4_l2_mmxext(dst, src1, src2, dststride, src1stride, h) \
-    ff_avg_pixels4_l2_mmxext((dst), (src1), (src2), (dststride))
-#define ff_put_pixels8_l2_sse2  ff_put_pixels8_l2_mmxext
-#define ff_avg_pixels8_l2_sse2(dst, src1, src2, dststride, src1stride, h) \
-    ff_avg_pixels8_l2_mmxext((dst), (src1), (src2), (dststride), (src1stride))
-#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
-#define ff_avg_pixels16_l2_sse2(dst, src1, src2, dststride, src1stride, h) \
-    ff_avg_pixels16_l2_mmxext((dst), (src1), (src2), (dststride), (src1stride))
+void ff_put_pixels4x4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                                ptrdiff_t stride);
+void ff_avg_pixels4x4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                                ptrdiff_t stride);
+#define ff_put_pixels4x4_l2_mmxext(dst, src1, src2, dststride, src1stride) \
+    ff_put_pixels4x4_l2_mmxext((dst), (src1), (src2), (dststride))
+#define ff_avg_pixels4x4_l2_mmxext(dst, src1, src2, dststride, src1stride) \
+    ff_avg_pixels4x4_l2_mmxext((dst), (src1), (src2), (dststride))
+#define ff_put_pixels8x8_l2_sse2  ff_put_pixels8x8_l2_mmxext
+#define ff_avg_pixels8x8_l2_sse2  ff_avg_pixels8x8_l2_mmxext
+#define ff_put_pixels16x16_l2_sse2 ff_put_pixels16x16_l2_mmxext
+#define ff_avg_pixels16x16_l2_sse2 ff_avg_pixels16x16_l2_mmxext
 
 #define DEF_QPEL(OPNAME)\
 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride);\
@@ -177,7 +175,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uin
 {\
     LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
-    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
+    ff_ ## OPNAME ## pixels ## SIZE ## x ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
@@ -189,7 +187,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
 {\
     LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
-    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
+    ff_ ## OPNAME ## pixels ## SIZE ## x ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride);\
 }\
 
 #define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT) \
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index 6e082819ac..3aa1f233a0 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -69,11 +69,11 @@ cglobal avg_pixels4, 3,4
     mova   %2, %1
 %endmacro
 
-; void ff_put/avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
-;                                   ptrdiff_t stride)
+; void ff_put/avg_pixels4x4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                                     ptrdiff_t stride)
 %macro PIXELS4_L2 1
 %define OP op_%1h
-cglobal %1_pixels4_l2, 4,4
+cglobal %1_pixels4x4_l2, 4,4
     mova         m0, [r1]
     mova         m1, [r1+r3]
     lea          r1, [r1+2*r3]
diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm
index 93f0d007c3..be8bc4f579 100644
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@@ -37,11 +37,9 @@ SECTION .text
 %macro PIXELS8_L2 1
 %define OP op_%1
 %ifidn %1, put
-; void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-;                               ptrdiff_t dstStride, ptrdiff_t src1Stride, int h)
-cglobal put_pixels8_l2, 6,6
-    test        r5d, 1
-    je        .loop
+; void ff_put_pixels8x9_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                                 ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal put_pixels8x9_l2, 5,6
     mova         m0, [r1]
     mova         m1, [r2]
     add          r1, r4
@@ -49,13 +47,14 @@ cglobal put_pixels8_l2, 6,6
     pavgb        m0, m1
     OP           m0, [r0]
     add          r0, r3
-    dec         r5d
-%else
-; void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-;                               ptrdiff_t dstStride, ptrdiff_t src1Stride)
-cglobal avg_pixels8_l2, 5,6
-    mov         r5d, 8
+    ; FIXME: avoid jump if prologue is empty
+    jmp          %1_pixels8x8_after_prologue_ %+ cpuname
 %endif
+; void ff_avg/put_pixels8x8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                                     ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal %1_pixels8x8_l2, 5,6
+%1_pixels8x8_after_prologue_ %+ cpuname:
+    mov         r5d, 8
 .loop:
     mova         m0, [r1]
     mova         m1, [r1+r4]
@@ -86,11 +85,9 @@ PIXELS8_L2 avg
 %macro PIXELS16_L2 1
 %define OP op_%1
 %ifidn %1, put
-; void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-;                                ptrdiff_t dstStride, ptrdiff_t src1Stride, int h)
-cglobal put_pixels16_l2, 6,6
-    test        r5d, 1
-    je        .loop
+; void ff_put_pixels16x17_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                                   ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal put_pixels16x17_l2, 5,6
     mova         m0, [r1]
     mova         m1, [r1+8]
     pavgb        m0, [r2]
@@ -100,13 +97,14 @@ cglobal put_pixels16_l2, 6,6
     OP           m0, [r0]
     OP           m1, [r0+8]
     add          r0, r3
-    dec         r5d
-%else
-; void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-;                                ptrdiff_t dstStride, ptrdiff_t src1Stride)
-cglobal avg_pixels16_l2, 5,6
-    mov         r5d, 16
+    ; FIXME: avoid jump if prologue is empty
+    jmp          %1_pixels16x16_after_prologue_ %+ cpuname
 %endif
+; void ff_avg/put_pixels16x16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                                       ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal %1_pixels16x16_l2, 5,6
+%1_pixels16x16_after_prologue_ %+ cpuname:
+    mov         r5d, 16
 .loop:
     mova         m0, [r1]
     mova         m1, [r1+8]
diff --git a/libavcodec/x86/qpel.h b/libavcodec/x86/qpel.h
index b30d5e23dc..c4b6ee0413 100644
--- a/libavcodec/x86/qpel.h
+++ b/libavcodec/x86/qpel.h
@@ -22,17 +22,17 @@
 #include <stddef.h>
 #include <stdint.h>
 
-void ff_put_pixels8_l2_mmxext(uint8_t *dst,
-                              const uint8_t *src1, const uint8_t *src2,
-                              ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels8_l2_mmxext(uint8_t *dst,
-                              const uint8_t *src1, const uint8_t *src2,
-                              ptrdiff_t dstStride, ptrdiff_t src1Stride);
-void ff_put_pixels16_l2_mmxext(uint8_t *dst,
-                               const uint8_t *src1, const uint8_t *src2,
-                               ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_avg_pixels16_l2_mmxext(uint8_t *dst,
-                               const uint8_t *src1, const uint8_t *src2,
-                               ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_pixels8x8_l2_mmxext(uint8_t *dst,
+                                const uint8_t *src1, const uint8_t *src2,
+                                ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_avg_pixels8x8_l2_mmxext(uint8_t *dst,
+                                const uint8_t *src1, const uint8_t *src2,
+                                ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_pixels16x16_l2_mmxext(uint8_t *dst,
+                                  const uint8_t *src1, const uint8_t *src2,
+                                  ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_avg_pixels16x16_l2_mmxext(uint8_t *dst,
+                                  const uint8_t *src1, const uint8_t *src2,
+                                  ptrdiff_t dstStride, ptrdiff_t src1Stride);
 
 #endif /* AVCODEC_X86_QPEL_H */
diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm
index 7fa7dbd2dc..52ddd8a8b2 100644
--- a/libavcodec/x86/qpeldsp.asm
+++ b/libavcodec/x86/qpeldsp.asm
@@ -32,13 +32,11 @@ cextern pw_20
 
 SECTION .text
 
-; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-;                               ptrdiff_t dstStride, ptrdiff_t src1Stride, int h)
 %macro PUT_NO_RND_PIXELS8_L2 0
-cglobal put_no_rnd_pixels8_l2, 6,6
+; void ff_put_no_rnd_pixels8x9_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                                 ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal put_no_rnd_pixels8x9_l2, 5,6
     pcmpeqb      m6, m6
-    test        r5d, 1
-    je .loop
     mova         m0, [r1]
     mova         m1, [r2]
     add          r1, r4
@@ -49,7 +47,14 @@ cglobal put_no_rnd_pixels8_l2, 6,6
     pxor         m0, m6
     mova       [r0], m0
     add          r0, r3
-    dec r5d
+    jmp          put_no_rnd_pixels8x8_after_prologue_ %+ cpuname
+
+; void ff_put_no_rnd_pixels8x8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                                 ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal put_no_rnd_pixels8x8_l2, 5,6
+    pcmpeqb      m6, m6
+put_no_rnd_pixels8x8_after_prologue_ %+ cpuname:
+    mov         r5d, 8
 .loop:
     mova         m0, [r1]
     add          r1, r4
@@ -97,13 +102,11 @@ INIT_MMX mmxext
 PUT_NO_RND_PIXELS8_L2
 
 
-; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-;                                ptrdiff_t dstStride, ptrdiff_t src1Stride, int h)
+; void ff_put_no_rnd_pixels16x17_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                                   ptrdiff_t dstStride, ptrdiff_t src1Stride)
 %macro PUT_NO_RND_PIXELS16_l2 0
-cglobal put_no_rnd_pixels16_l2, 6,6
+cglobal put_no_rnd_pixels16x17_l2, 5,6
     pcmpeqb      m6, m6
-    test        r5d, 1
-    je .loop
     mova         m0, [r1]
     mova         m1, [r1+8]
     mova         m2, [r2]
@@ -121,7 +124,14 @@ cglobal put_no_rnd_pixels16_l2, 6,6
     mova       [r0], m0
     mova     [r0+8], m1
     add          r0, r3
-    dec r5d
+    jmp          put_no_rnd_pixels16x16_after_prologue_ %+ cpuname
+
+; void ff_put_no_rnd_pixels16x16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                                   ptrdiff_t dstStride, ptrdiff_t src1Stride)
+cglobal put_no_rnd_pixels16x16_l2, 5,6
+    pcmpeqb      m6, m6
+put_no_rnd_pixels16x16_after_prologue_ %+ cpuname:
+    mov         r5d, 16
 .loop:
     mova         m0, [r1]
     mova         m1, [r1+8]
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index f88c804a48..cab2ac433a 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -31,12 +31,24 @@
 #include "fpel.h"
 #include "qpel.h"
 
-void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst,
-                                     const uint8_t *src1, const uint8_t *src2,
-                                     ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
-void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst,
-                                      const uint8_t *src1, const uint8_t *src2,
-                                      ptrdiff_t dstStride, ptrdiff_t src1Stride, int h);
+void ff_put_pixels8x9_l2_mmxext(uint8_t *dst,
+                                const uint8_t *src1, const uint8_t *src2,
+                                ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_pixels16x17_l2_mmxext(uint8_t *dst,
+                                  const uint8_t *src1, const uint8_t *src2,
+                                  ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_no_rnd_pixels8x8_l2_mmxext(uint8_t *dst,
+                                       const uint8_t *src1, const uint8_t *src2,
+                                       ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_no_rnd_pixels8x9_l2_mmxext(uint8_t *dst,
+                                       const uint8_t *src1, const uint8_t *src2,
+                                       ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_no_rnd_pixels16x16_l2_mmxext(uint8_t *dst,
+                                         const uint8_t *src1, const uint8_t *src2,
+                                         ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_no_rnd_pixels16x17_l2_mmxext(uint8_t *dst,
+                                         const uint8_t *src1, const uint8_t *src2,
+                                         ptrdiff_t dstStride, ptrdiff_t src1Stride);
 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
                                           ptrdiff_t dstStride, ptrdiff_t srcStride, int h);
 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
@@ -70,7 +82,7 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
 
 #if HAVE_X86ASM
 
-#define QPEL_OP(OPNAME, RND, MMX, ARG)                                  \
+#define QPEL_OP(OPNAME, RND, MMX)                                       \
 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst,                  \
                                          const uint8_t *src,            \
                                          ptrdiff_t stride)              \
@@ -79,8 +91,8 @@ static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst,                  \
     uint8_t *const half = (uint8_t *) temp;                             \
     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
                                                    stride, 8);          \
-    ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src, half,             \
-                                            stride, stride, 8));        \
+    ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, src, half,               \
+                                          stride, stride);              \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst,                  \
@@ -99,8 +111,8 @@ static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst,                  \
     uint8_t *const half = (uint8_t *) temp;                             \
     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
                                                    stride, 8);          \
-    ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src + 1, half,         \
-                                            stride, stride, 8));        \
+    ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, src + 1, half,           \
+                                          stride, stride);              \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst,                  \
@@ -111,8 +123,8 @@ static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst,                  \
     uint8_t *const half = (uint8_t *) temp;                             \
     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
                                                    8, stride);          \
-    ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src, half,             \
-                                            stride, stride, 8));        \
+    ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, src, half,               \
+                                          stride, stride);              \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst,                  \
@@ -131,8 +143,8 @@ static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst,                  \
     uint8_t *const half = (uint8_t *) temp;                             \
     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
                                                    8, stride);          \
-    ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, src + stride, half,    \
-                                            stride, stride, 8));        \
+    ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, src + stride, half,      \
+                                          stride, stride);              \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst,                  \
@@ -144,11 +156,11 @@ static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst,                  \
     uint8_t *const halfHV = (uint8_t *) half;                           \
     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                    stride, 9);          \
-    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
-                                        stride, 9);                     \
+    ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src, halfH, 8,         \
+                                          stride);                      \
     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH, halfHV,         \
-                                            stride, 8, 8));             \
+    ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH, halfHV,           \
+                                          stride, 8);                   \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst,                  \
@@ -160,11 +172,11 @@ static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst,                  \
     uint8_t *const halfHV = (uint8_t *) half;                           \
     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                    stride, 9);          \
-    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
-                                        stride, 9);                     \
+    ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src + 1, halfH, 8,     \
+                                          stride);                      \
     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH, halfHV,         \
-                                            stride, 8, 8));             \
+    ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH, halfHV,           \
+                                          stride, 8);                   \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst,                  \
@@ -176,11 +188,11 @@ static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst,                  \
     uint8_t *const halfHV = (uint8_t *) half;                           \
     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                    stride, 9);          \
-    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
-                                        stride, 9);                     \
+    ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src, halfH, 8,         \
+                                          stride);                      \
     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH + 8, halfHV,     \
-                                            stride, 8, 8));             \
+    ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH + 8, halfHV,       \
+                                          stride, 8);                   \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst,                  \
@@ -192,11 +204,11 @@ static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst,                  \
     uint8_t *const halfHV = (uint8_t *) half;                           \
     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                    stride, 9);          \
-    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
-                                        stride, 9);                     \
+    ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src + 1, halfH, 8,     \
+                                          stride);                      \
     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH + 8, halfHV,     \
-                                            stride, 8, 8));             \
+    ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH + 8, halfHV,       \
+                                          stride, 8);                   \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst,                  \
@@ -209,8 +221,8 @@ static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst,                  \
     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                    stride, 9);          \
     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH, halfHV,         \
-                                            stride, 8, 8));             \
+    ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH, halfHV,           \
+                                          stride, 8);                   \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst,                  \
@@ -223,8 +235,8 @@ static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst,                  \
     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                    stride, 9);          \
     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    ff_ ## OPNAME ## pixels8_l2_ ## MMX(ARG(dst, halfH + 8, halfHV,     \
-                                            stride, 8, 8));             \
+    ff_ ## OPNAME ## pixels8x8_l2_ ## MMX(dst, halfH + 8, halfHV,       \
+                                          stride, 8);                   \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst,                  \
@@ -235,8 +247,8 @@ static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst,                  \
     uint8_t *const halfH = (uint8_t *) half;                            \
     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                    stride, 9);          \
-    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH,              \
-                                        8, stride, 9);                  \
+    ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src, halfH,            \
+                                          8, stride);                   \
     ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
                                                    stride, 8);          \
 }                                                                       \
@@ -249,8 +261,8 @@ static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst,                  \
     uint8_t *const halfH = (uint8_t *) half;                            \
     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                    stride, 9);          \
-    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
-                                        stride, 9);                     \
+    ff_put ## RND ## pixels8x9_l2_ ## MMX(halfH, src + 1, halfH, 8,     \
+                                          stride);                      \
     ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
                                                    stride, 8);          \
 }                                                                       \
@@ -275,8 +287,8 @@ static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst,                 \
     uint8_t *const half = (uint8_t *) temp;                             \
     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
                                                     stride, 16);        \
-    ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src, half,            \
-                                             stride, stride, 16));      \
+    ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, src, half,             \
+                                            stride, stride);            \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst,                 \
@@ -295,8 +307,8 @@ static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst,                 \
     uint8_t *const half = (uint8_t*) temp;                              \
     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
                                                     stride, 16);        \
-    ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src + 1, half,        \
-                                             stride, stride, 16));      \
+    ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, src + 1, half,         \
+                                            stride, stride);            \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst,                 \
@@ -307,8 +319,8 @@ static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst,                 \
     uint8_t *const half = (uint8_t *) temp;                             \
     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
                                                     stride);            \
-    ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src, half,            \
-                                             stride, stride, 16));      \
+    ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, src, half,             \
+                                            stride, stride);            \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst,                 \
@@ -327,8 +339,8 @@ static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst,                 \
     uint8_t *const half = (uint8_t *) temp;                             \
     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
                                                     stride);            \
-    ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, src+stride, half,     \
-                                             stride, stride, 16));      \
+    ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, src+stride, half,      \
+                                            stride, stride);            \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst,                 \
@@ -340,12 +352,12 @@ static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst,                 \
     uint8_t *const halfHV = (uint8_t *) half;                           \
     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                     stride, 17);        \
-    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
-                                         stride, 17);                   \
+    ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src, halfH, 16,      \
+                                            stride);                    \
     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                     16, 16);            \
-    ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH, halfHV,        \
-                                             stride, 16, 16));          \
+    ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH, halfHV,         \
+                                            stride, 16);                \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst,                 \
@@ -357,12 +369,12 @@ static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst,                 \
     uint8_t *const halfHV = (uint8_t *) half;                           \
     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                     stride, 17);        \
-    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
-                                         stride, 17);                   \
+    ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src + 1, halfH, 16,  \
+                                            stride);                    \
     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                     16, 16);            \
-    ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH, halfHV,        \
-                                             stride, 16, 16));          \
+    ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH, halfHV,         \
+                                            stride, 16);                \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst,                 \
@@ -374,12 +386,12 @@ static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst,                 \
     uint8_t *const halfHV = (uint8_t *) half;                           \
     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                     stride, 17);        \
-    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
-                                         stride, 17);                   \
+    ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src, halfH, 16,      \
+                                            stride);                    \
     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                     16, 16);            \
-    ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH + 16, halfHV,   \
-                                             stride, 16, 16));          \
+    ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH + 16, halfHV,    \
+                                            stride, 16);                \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst,                 \
@@ -391,12 +403,12 @@ static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst,                 \
     uint8_t *const halfHV = (uint8_t *) half;                           \
     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                     stride, 17);        \
-    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
-                                         stride, 17);                   \
+    ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src + 1, halfH, 16,  \
+                                            stride);                    \
     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                     16, 16);            \
-    ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH + 16, halfHV,   \
-                                             stride, 16, 16));          \
+    ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH + 16, halfHV,    \
+                                            stride, 16);                \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst,                 \
@@ -410,8 +422,8 @@ static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst,                 \
                                                     stride, 17);        \
     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                     16, 16);            \
-    ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH, halfHV,        \
-                                             stride, 16, 16));          \
+    ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH, halfHV,         \
+                                            stride, 16);                \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst,                 \
@@ -425,8 +437,8 @@ static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst,                 \
                                                     stride, 17);        \
     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                     16, 16);            \
-    ff_ ## OPNAME ## pixels16_l2_ ## MMX(ARG(dst, halfH + 16, halfHV,   \
-                                             stride, 16, 16));          \
+    ff_ ## OPNAME ## pixels16x16_l2_ ## MMX(dst, halfH + 16, halfHV,    \
+                                            stride, 16);                \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst,                 \
@@ -437,8 +449,8 @@ static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst,                 \
     uint8_t *const halfH = (uint8_t *) half;                            \
     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                     stride, 17);        \
-    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
-                                         stride, 17);                   \
+    ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src, halfH, 16,      \
+                                            stride);                    \
     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
                                                     stride, 16);        \
 }                                                                       \
@@ -451,8 +463,8 @@ static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst,                 \
     uint8_t *const halfH = (uint8_t *) half;                            \
     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                     stride, 17);        \
-    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
-                                         stride, 17);                   \
+    ff_put ## RND ## pixels16x17_l2_ ## MMX(halfH, src + 1, halfH, 16,  \
+                                            stride);                    \
     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
                                                     stride, 16);        \
 }                                                                       \
@@ -469,13 +481,9 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst,                 \
                                                     stride, 16);        \
 }
 
-#define PASSTHROUGH(...) __VA_ARGS__
-#define STRIP_HEIGHT(dst, src1, src2, dststride, srcstride, height) \
-    (dst), (src1), (src2), (dststride), (srcstride)
-
-QPEL_OP(put_,        _,        mmxext, PASSTHROUGH)
-QPEL_OP(avg_,        _,        mmxext, STRIP_HEIGHT)
-QPEL_OP(put_no_rnd_, _no_rnd_, mmxext, PASSTHROUGH)
+QPEL_OP(put_,        _,        mmxext)
+QPEL_OP(avg_,        _,        mmxext)
+QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)
 
 #endif /* HAVE_X86ASM */
 
-- 
2.49.1


>From 756b072c08a41659b9c7400e4c60f070fd16eefa Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 29 Oct 2025 15:07:40 +0100
Subject: [PATCH 3/3] avcodec/x86/h264_qpel: Add and use
 ff_{avg,put}_pixels16x16_l2_sse2()

This avoids mmx (the size 16 h264qpel dsp now no longer uses any mmx)
and improves performance, particularly for the avg case:

Old benchmarks:
avg_h264_qpel_16_mc01_8_c:                             780.0 ( 1.00x)
avg_h264_qpel_16_mc01_8_sse2:                           91.2 ( 8.55x)
avg_h264_qpel_16_mc03_8_c:                             804.0 ( 1.00x)
avg_h264_qpel_16_mc03_8_sse2:                           91.2 ( 8.82x)
put_h264_qpel_16_mc01_8_c:                             779.5 ( 1.00x)
put_h264_qpel_16_mc01_8_sse2:                           82.8 ( 9.41x)
put_h264_qpel_16_mc03_8_c:                             770.1 ( 1.00x)
put_h264_qpel_16_mc03_8_sse2:                           82.5 ( 9.33x)

New benchmarks:
avg_h264_qpel_16_mc01_8_c:                             783.9 ( 1.00x)
avg_h264_qpel_16_mc01_8_sse2:                           84.1 ( 9.32x)
avg_h264_qpel_16_mc03_8_c:                             797.4 ( 1.00x)
avg_h264_qpel_16_mc03_8_sse2:                           83.9 ( 9.51x)
put_h264_qpel_16_mc01_8_c:                             767.4 ( 1.00x)
put_h264_qpel_16_mc01_8_sse2:                           80.5 ( 9.53x)
put_h264_qpel_16_mc03_8_c:                             779.9 ( 1.00x)
put_h264_qpel_16_mc03_8_sse2:                           80.3 ( 9.71x)

(qpeldsp will use these functions when it gets ported to SSE2;
then the mmxext functions will be removed as well.)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/h264_qpel.c |  2 --
 libavcodec/x86/qpel.asm    | 45 +++++++++++++++++++++-----------------
 libavcodec/x86/qpel.h      |  6 +++++
 3 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index b17ee7e02d..f2bcca1e11 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -42,8 +42,6 @@ void ff_avg_pixels4x4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t
     ff_avg_pixels4x4_l2_mmxext((dst), (src1), (src2), (dststride))
 #define ff_put_pixels8x8_l2_sse2  ff_put_pixels8x8_l2_mmxext
 #define ff_avg_pixels8x8_l2_sse2  ff_avg_pixels8x8_l2_mmxext
-#define ff_put_pixels16x16_l2_sse2 ff_put_pixels16x16_l2_mmxext
-#define ff_avg_pixels16x16_l2_sse2 ff_avg_pixels16x16_l2_mmxext
 
 #define DEF_QPEL(OPNAME)\
 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride);\
diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm
index be8bc4f579..8f18cf93db 100644
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@@ -34,53 +34,58 @@ SECTION .text
     mova   %2, %1
 %endmacro
 
-%macro PIXELS8_L2 1
+%macro PIXELS_L2 2-3 ; avg vs put, size, size+1
 %define OP op_%1
 %ifidn %1, put
+%if notcpuflag(sse2) ; SSE2 currently only uses 16x16
 ; void ff_put_pixels8x9_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 ;                                 ptrdiff_t dstStride, ptrdiff_t src1Stride)
-cglobal put_pixels8x9_l2, 5,6
-    mova         m0, [r1]
-    mova         m1, [r2]
+cglobal put_pixels%2x%3_l2, 5,6,2
+    movu         m0, [r1]
+    pavgb        m0, [r2]
     add          r1, r4
-    add          r2, 8
-    pavgb        m0, m1
+    add          r2, mmsize
     OP           m0, [r0]
     add          r0, r3
     ; FIXME: avoid jump if prologue is empty
-    jmp          %1_pixels8x8_after_prologue_ %+ cpuname
+    jmp          %1_pixels%2x%2_after_prologue_ %+ cpuname
+%endif
 %endif
 ; void ff_avg/put_pixels8x8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 ;                                     ptrdiff_t dstStride, ptrdiff_t src1Stride)
-cglobal %1_pixels8x8_l2, 5,6
-%1_pixels8x8_after_prologue_ %+ cpuname:
-    mov         r5d, 8
+cglobal %1_pixels%2x%2_l2, 5,6,2
+%1_pixels%2x%2_after_prologue_ %+ cpuname:
+    mov         r5d, %2
 .loop:
-    mova         m0, [r1]
-    mova         m1, [r1+r4]
+    movu         m0, [r1]
+    movu         m1, [r1+r4]
     lea          r1, [r1+2*r4]
     pavgb        m0, [r2]
-    pavgb        m1, [r2+8]
+    pavgb        m1, [r2+mmsize]
     OP           m0, [r0]
     OP           m1, [r0+r3]
     lea          r0, [r0+2*r3]
-    mova         m0, [r1]
-    mova         m1, [r1+r4]
+    movu         m0, [r1]
+    movu         m1, [r1+r4]
     lea          r1, [r1+2*r4]
-    pavgb        m0, [r2+16]
-    pavgb        m1, [r2+24]
+    pavgb        m0, [r2+2*mmsize]
+    pavgb        m1, [r2+3*mmsize]
     OP           m0, [r0]
     OP           m1, [r0+r3]
     lea          r0, [r0+2*r3]
-    add          r2, 32
+    add          r2, 4*mmsize
     sub         r5d, 4
     jne       .loop
     RET
 %endmacro
 
 INIT_MMX mmxext
-PIXELS8_L2 put
-PIXELS8_L2 avg
+PIXELS_L2 put, 8, 9
+PIXELS_L2 avg, 8
+
+INIT_XMM sse2
+PIXELS_L2 put, 16, 17
+PIXELS_L2 avg, 16
 
 %macro PIXELS16_L2 1
 %define OP op_%1
diff --git a/libavcodec/x86/qpel.h b/libavcodec/x86/qpel.h
index c4b6ee0413..61c0473331 100644
--- a/libavcodec/x86/qpel.h
+++ b/libavcodec/x86/qpel.h
@@ -31,8 +31,14 @@ void ff_avg_pixels8x8_l2_mmxext(uint8_t *dst,
 void ff_put_pixels16x16_l2_mmxext(uint8_t *dst,
                                   const uint8_t *src1, const uint8_t *src2,
                                   ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_put_pixels16x16_l2_sse2(uint8_t *dst,
+                                const uint8_t *src1, const uint8_t *src2,
+                                ptrdiff_t dstStride, ptrdiff_t src1Stride);
 void ff_avg_pixels16x16_l2_mmxext(uint8_t *dst,
                                   const uint8_t *src1, const uint8_t *src2,
                                   ptrdiff_t dstStride, ptrdiff_t src1Stride);
+void ff_avg_pixels16x16_l2_sse2(uint8_t *dst,
+                                const uint8_t *src1, const uint8_t *src2,
+                                ptrdiff_t dstStride, ptrdiff_t src1Stride);
 
 #endif /* AVCODEC_X86_QPEL_H */
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2025-10-29 14:33 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-29 14:32 [FFmpeg-devel] [PATCH] avcodec/x86/qpel: Add specializations for put_l2 functions (PR #20785) mkver via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git