* [FFmpeg-devel] [PATCH] h264qpel (PR #20645)
@ 2025-10-04 15:47 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-04 15:47 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #20645 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20645
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20645.patch
From 549383606f01ac1cf481c0b69b923ed44fcb99d4 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 4 Oct 2025 07:29:35 +0200
Subject: [PATCH 1/8] avcodec/x86/h264_qpel_8bit: Improve register allocation
None of the other registers need to be preserved at this time,
so six XMM registers are always enough. Forgotten in
fa9ea5113b48904daef9df6a282bd9c04c32258d.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/h264_qpel_8bit.asm | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index ede4f382e1..bbf591664a 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -634,11 +634,7 @@ QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
-%ifidn %1, avg
-cglobal %1_h264_qpel8_hv2_lowpass, 3,4,7 ; dst, tmp, dstStride
-%else
cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride
-%endif
mov r3d, 8
.loop:
mova m1, [r1+16]
@@ -663,7 +659,7 @@ cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride
paddw m0, m2
psraw m0, 6
packuswb m0, m0
- op_%1h m0, [r0], m6
+ op_%1h m0, [r0], m5
add r1, 48
add r0, r2
dec r3d
--
2.49.1
From 48460a769cf736dac6237bd4116eafb1cce51d2a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 4 Oct 2025 08:43:21 +0200
Subject: [PATCH 2/8] avcodec/h264qpel: Move Snow-only code to snow.c
Blocksize 2 is Snow-only, so move all the code pertaining
to it to snow.c. Also make the put array in H264QpelContext
smaller -- it only needs three sets of 16 function pointers.
This continues 6eb8bc42176f73c1d7c2e9f4bc1ab988f7149de5
and b0c91c2fba82f98dfe7a70f2591ec7a2126820c0.
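For context, a minimal C sketch of the aliasing that the snow.h hunk
below relies on (SnowLikeContext and the abridged qpel_mc_func
signature are hypothetical stand-ins, not the real definitions):
ff_h264qpel_init() fills rows 0-2 of Snow's 4x16 put table through the
shared prefix, and Snow's own init code fills row 3 (blocksize 2).

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef void (*qpel_mc_func)(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride);

    typedef struct H264QpelContext {
        qpel_mc_func put_h264_qpel_pixels_tab[3][16];
        qpel_mc_func avg_h264_qpel_pixels_tab[3][16];
    } H264QpelContext;

    typedef struct SnowLikeContext {
        union {
            /* rows 0-2 alias the H.264 put table above; row 3 holds the
             * Snow-only blocksize-2 functions set up by Snow itself */
            qpel_mc_func put_snow_qpel_pixels_tab[4][16];
            H264QpelContext h264qpel;
        };
    } SnowLikeContext;

    /* the aliasing only works if the put table is the first member */
    static_assert(offsetof(H264QpelContext, put_h264_qpel_pixels_tab) == 0,
                  "put table must be at the start of H264QpelContext");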
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/h264qpel.c | 60 -------------------
libavcodec/h264qpel.h | 2 +-
libavcodec/h264qpel_template.c | 94 ++++++++++++++---------------
libavcodec/pel_template.c | 2 +-
libavcodec/snow.c | 104 ++++++++++++++++++++++++++++++---
libavcodec/snow.h | 6 +-
tests/checkasm/h264qpel.c | 2 +-
7 files changed, 152 insertions(+), 118 deletions(-)
diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c
index be80203c4b..0bc715c638 100644
--- a/libavcodec/h264qpel.c
+++ b/libavcodec/h264qpel.c
@@ -26,65 +26,6 @@
#define pixeltmp int16_t
#define BIT_DEPTH 8
#include "h264qpel_template.c"
-
-static void put_h264_qpel2_h_lowpass_8(uint8_t *dst, const uint8_t *restrict src, int dstStride, int srcStride)
-{
- const int h = 2;
- for (int i = 0; i < h; ++i) {
- dst[0] = av_clip_uint8(((src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]) + 16) >> 5);
- dst[1] = av_clip_uint8(((src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]) + 16) >> 5);
- dst += dstStride;
- src += srcStride;
- }
-}
-
-static void put_h264_qpel2_v_lowpass_8(uint8_t *dst, const uint8_t *restrict src, int dstStride, int srcStride)
-{
- const int w = 2;
- for (int i = 0; i < w; ++i) {
- const int srcB = src[-2*srcStride];
- const int srcA = src[-1*srcStride];
- const int src0 = src[0 *srcStride];
- const int src1 = src[1 *srcStride];
- const int src2 = src[2 *srcStride];
- const int src3 = src[3 *srcStride];
- const int src4 = src[4 *srcStride];
- dst[0*dstStride] = av_clip_uint8(((src0+src1)*20 - (srcA+src2)*5 + (srcB+src3) + 16) >> 5);
- dst[1*dstStride] = av_clip_uint8(((src1+src2)*20 - (src0+src3)*5 + (srcA+src4) + 16) >> 5);
- dst++;
- src++;
- }
-}
-
-static void put_h264_qpel2_hv_lowpass_8(uint8_t *dst, pixeltmp *tmp, const uint8_t *restrict src, int dstStride, int tmpStride, int srcStride)
-{
- const int h = 2;
- const int w = 2;
- src -= 2*srcStride;
- for (int i = 0; i < h + 5; ++i) {
- tmp[0] = (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);
- tmp[1] = (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);
- tmp += tmpStride;
- src += srcStride;
- }
- tmp -= tmpStride*(h+5-2);
- for (int i = 0; i < w; ++i) {
- const int tmpB = tmp[-2*tmpStride];
- const int tmpA = tmp[-1*tmpStride];
- const int tmp0 = tmp[0 *tmpStride];
- const int tmp1 = tmp[1 *tmpStride];
- const int tmp2 = tmp[2 *tmpStride];
- const int tmp3 = tmp[3 *tmpStride];
- const int tmp4 = tmp[4 *tmpStride];
- dst[0*dstStride] = av_clip_uint8(((tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3) + 512) >> 10);
- dst[1*dstStride] = av_clip_uint8(((tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4) + 512) >> 10);
- dst++;
- tmp++;
- }
-}
-
-H264_MC(put_, 2)
-
#undef BIT_DEPTH
#define BIT_DEPTH 9
@@ -140,7 +81,6 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
switch (bit_depth) {
default:
SET_QPEL(8);
- dspfunc2(put_h264_qpel, 3, 2, 8); // only used by Snow
break;
case 9:
SET_QPEL(9);
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h
index 6ae5ba1724..f198a9483c 100644
--- a/libavcodec/h264qpel.h
+++ b/libavcodec/h264qpel.h
@@ -25,7 +25,7 @@
#include "qpeldsp.h"
typedef struct H264QpelContext {
- qpel_mc_func put_h264_qpel_pixels_tab[4][16];
+ qpel_mc_func put_h264_qpel_pixels_tab[3][16];
qpel_mc_func avg_h264_qpel_pixels_tab[3][16];
} H264QpelContext;
diff --git a/libavcodec/h264qpel_template.c b/libavcodec/h264qpel_template.c
index a55b45e824..875ac86d15 100644
--- a/libavcodec/h264qpel_template.c
+++ b/libavcodec/h264qpel_template.c
@@ -304,134 +304,134 @@ static void FUNC(OPNAME ## h264_qpel16_hv_lowpass)(uint8_t *dst, pixeltmp *tmp,
FUNC(OPNAME ## h264_qpel8_hv_lowpass)(dst+8*sizeof(pixel), tmp+8, src+8*sizeof(pixel), dstStride, tmpStride, srcStride);\
}\
-#define H264_MC(OPNAME, SIZE) \
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc00)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+#define H264_MC(OPNAME, NAME, SIZE) \
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc00)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
FUNCC(OPNAME ## pixels ## SIZE)(dst, src, stride, SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc10)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc10)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t half[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(half, src, SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(half, src, SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, src, half, stride, stride, SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc20)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc20)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
- FUNC(OPNAME ## h264_qpel ## SIZE ## _h_lowpass)(dst, src, stride, stride);\
+ FUNC(OPNAME ## NAME ## _qpel ## SIZE ## _h_lowpass)(dst, src, stride, stride);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc30)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc30)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t half[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(half, src, SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(half, src, SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, src+sizeof(pixel), half, stride, stride, SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc01)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc01)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t half[SIZE*SIZE*sizeof(pixel)];\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(half, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(half, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, full_mid, half, stride, SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc02)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc02)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(OPNAME ## h264_qpel ## SIZE ## _v_lowpass)(dst, full_mid, stride, SIZE*sizeof(pixel));\
+ FUNC(OPNAME ## NAME ## _qpel ## SIZE ## _v_lowpass)(dst, full_mid, stride, SIZE*sizeof(pixel));\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc03)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc03)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t half[SIZE*SIZE*sizeof(pixel)];\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(half, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(half, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, full_mid+SIZE*sizeof(pixel), half, stride, SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc11)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc11)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel), stride);\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfV, stride, SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc31)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc31)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel), stride);\
FUNC(copy_block ## SIZE )(full, src - stride*2 + sizeof(pixel), SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfV, stride, SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc13)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc13)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src + stride, SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src + stride, SIZE*sizeof(pixel), stride);\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfV, stride, SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc33)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc33)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src + stride, SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src + stride, SIZE*sizeof(pixel), stride);\
FUNC(copy_block ## SIZE )(full, src - stride*2 + sizeof(pixel), SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfV, stride, SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc22)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc22)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
- FUNC(OPNAME ## h264_qpel ## SIZE ## _hv_lowpass)(dst, tmp, src, stride, SIZE*sizeof(pixel), stride);\
+ FUNC(OPNAME ## NAME ## _qpel ## SIZE ## _hv_lowpass)(dst, tmp, src, stride, SIZE*sizeof(pixel), stride);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc21)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc21)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel), stride);\
- FUNC(put_h264_qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src, SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src, SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfHV, stride, SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc23)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc23)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src + stride, SIZE*sizeof(pixel), stride);\
- FUNC(put_h264_qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src, SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src + stride, SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src, SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfHV, stride, SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc12)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc12)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
@@ -439,12 +439,12 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc12)(uint8_t *dst, const uint
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
- FUNC(put_h264_qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src, SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src, SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfV, halfHV, stride, SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc32)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc32)(uint8_t *dst, const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
@@ -452,8 +452,8 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc32)(uint8_t *dst, const uint
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
FUNC(copy_block ## SIZE )(full, src - stride*2 + sizeof(pixel), SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
- FUNC(put_h264_qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src, SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid, SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src, SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfV, halfHV, stride, SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
@@ -463,14 +463,16 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc32)(uint8_t *dst, const uint
#define op2_avg(a, b) a = (((a)+CLIP(((b) + 512)>>10)+1)>>1)
#define op2_put(a, b) a = CLIP(((b) + 512)>>10)
+#ifndef SNOW
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
-H264_MC(put_, 4)
-H264_MC(put_, 8)
-H264_MC(put_, 16)
-H264_MC(avg_, 4)
-H264_MC(avg_, 8)
-H264_MC(avg_, 16)
+H264_MC(put_, h264, 4)
+H264_MC(put_, h264, 8)
+H264_MC(put_, h264, 16)
+H264_MC(avg_, h264, 4)
+H264_MC(avg_, h264, 8)
+H264_MC(avg_, h264, 16)
+#endif
#undef op_avg
#undef op_put
diff --git a/libavcodec/pel_template.c b/libavcodec/pel_template.c
index 6da7a56b2d..7de3db72f1 100644
--- a/libavcodec/pel_template.c
+++ b/libavcodec/pel_template.c
@@ -66,7 +66,7 @@ static inline void FUNCC(OPNAME ## _pixels8)(uint8_t *block, \
block += line_size; \
} \
} \
- \
+av_unused \
CALL_2X_PIXELS(FUNCC(OPNAME ## _pixels16), \
FUNCC(OPNAME ## _pixels8), \
8 * sizeof(pixel))
diff --git a/libavcodec/snow.c b/libavcodec/snow.c
index e0ce83eb9c..006d84d8ce 100644
--- a/libavcodec/snow.c
+++ b/libavcodec/snow.c
@@ -18,6 +18,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <assert.h>
+
#include "libavutil/log.h"
#include "libavutil/mem.h"
#include "libavutil/thread.h"
@@ -26,6 +28,91 @@
#include "snow.h"
#include "snowdata.h"
+#define pixeltmp int16_t
+#define BIT_DEPTH 8
+#define SNOW
+#include "h264qpel_template.c"
+
+static void put_snow_qpel2_h_lowpass_8(uint8_t *dst, const uint8_t *restrict src, int dstStride, int srcStride)
+{
+ const int h = 2;
+ for (int i = 0; i < h; ++i) {
+ dst[0] = av_clip_uint8(((src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]) + 16) >> 5);
+ dst[1] = av_clip_uint8(((src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]) + 16) >> 5);
+ dst += dstStride;
+ src += srcStride;
+ }
+}
+
+static void put_snow_qpel2_v_lowpass_8(uint8_t *dst, const uint8_t *restrict src, int dstStride, int srcStride)
+{
+ const int w = 2;
+ for (int i = 0; i < w; ++i) {
+ const int srcB = src[-2*srcStride];
+ const int srcA = src[-1*srcStride];
+ const int src0 = src[0 *srcStride];
+ const int src1 = src[1 *srcStride];
+ const int src2 = src[2 *srcStride];
+ const int src3 = src[3 *srcStride];
+ const int src4 = src[4 *srcStride];
+ dst[0*dstStride] = av_clip_uint8(((src0+src1)*20 - (srcA+src2)*5 + (srcB+src3) + 16) >> 5);
+ dst[1*dstStride] = av_clip_uint8(((src1+src2)*20 - (src0+src3)*5 + (srcA+src4) + 16) >> 5);
+ dst++;
+ src++;
+ }
+}
+
+static void put_snow_qpel2_hv_lowpass_8(uint8_t *dst, pixeltmp *tmp, const uint8_t *restrict src, int dstStride, int tmpStride, int srcStride)
+{
+ const int h = 2;
+ const int w = 2;
+ src -= 2*srcStride;
+ for (int i = 0; i < h + 5; ++i) {
+ tmp[0] = (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);
+ tmp[1] = (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);
+ tmp += tmpStride;
+ src += srcStride;
+ }
+ tmp -= tmpStride*(h+5-2);
+ for (int i = 0; i < w; ++i) {
+ const int tmpB = tmp[-2*tmpStride];
+ const int tmpA = tmp[-1*tmpStride];
+ const int tmp0 = tmp[0 *tmpStride];
+ const int tmp1 = tmp[1 *tmpStride];
+ const int tmp2 = tmp[2 *tmpStride];
+ const int tmp3 = tmp[3 *tmpStride];
+ const int tmp4 = tmp[4 *tmpStride];
+ dst[0*dstStride] = av_clip_uint8(((tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3) + 512) >> 10);
+ dst[1*dstStride] = av_clip_uint8(((tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4) + 512) >> 10);
+ dst++;
+ tmp++;
+ }
+}
+
+H264_MC(put_, snow, 2)
+
+static av_cold void init_qpel(SnowContext *const s)
+{
+ static_assert(offsetof(H264QpelContext, put_h264_qpel_pixels_tab) == 0,
+ "put_h264_qpel_pixels_tab not at start of H264QpelContext");
+ ff_h264qpel_init(&s->h264qpel, 8);
+ s->put_snow_qpel_pixels_tab[3][0] = put_snow_qpel2_mc00_8_c;
+ s->put_snow_qpel_pixels_tab[3][1] = put_snow_qpel2_mc10_8_c;
+ s->put_snow_qpel_pixels_tab[3][2] = put_snow_qpel2_mc20_8_c;
+ s->put_snow_qpel_pixels_tab[3][3] = put_snow_qpel2_mc30_8_c;
+ s->put_snow_qpel_pixels_tab[3][4] = put_snow_qpel2_mc01_8_c;
+ s->put_snow_qpel_pixels_tab[3][5] = put_snow_qpel2_mc11_8_c;
+ s->put_snow_qpel_pixels_tab[3][6] = put_snow_qpel2_mc21_8_c;
+ s->put_snow_qpel_pixels_tab[3][7] = put_snow_qpel2_mc31_8_c;
+ s->put_snow_qpel_pixels_tab[3][8] = put_snow_qpel2_mc02_8_c;
+ s->put_snow_qpel_pixels_tab[3][9] = put_snow_qpel2_mc12_8_c;
+ s->put_snow_qpel_pixels_tab[3][10] = put_snow_qpel2_mc22_8_c;
+ s->put_snow_qpel_pixels_tab[3][11] = put_snow_qpel2_mc32_8_c;
+ s->put_snow_qpel_pixels_tab[3][12] = put_snow_qpel2_mc03_8_c;
+ s->put_snow_qpel_pixels_tab[3][13] = put_snow_qpel2_mc13_8_c;
+ s->put_snow_qpel_pixels_tab[3][14] = put_snow_qpel2_mc23_8_c;
+ s->put_snow_qpel_pixels_tab[3][15] = put_snow_qpel2_mc33_8_c;
+}
void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
@@ -354,18 +441,18 @@ void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, ptrdiff_t st
else if(b_w==32){
int y;
for(y=0; y<b_h; y+=16){
- s->h264qpel.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 3 + (y+3)*stride,stride);
- s->h264qpel.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 19 + (y+3)*stride,stride);
+ s->put_snow_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 3 + (y+3)*stride,stride);
+ s->put_snow_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 19 + (y+3)*stride,stride);
}
}else if(b_w==b_h)
- s->h264qpel.put_h264_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst,src + 3 + 3*stride,stride);
+ s->put_snow_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst,src + 3 + 3*stride,stride);
else if(b_w==2*b_h){
- s->h264qpel.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst ,src + 3 + 3*stride,stride);
- s->h264qpel.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 3 + b_h + 3*stride,stride);
+ s->put_snow_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst ,src + 3 + 3*stride,stride);
+ s->put_snow_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 3 + b_h + 3*stride,stride);
}else{
av_assert2(2*b_w==b_h);
- s->h264qpel.put_h264_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst ,src + 3 + 3*stride ,stride);
- s->h264qpel.put_h264_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst+b_w*stride,src + 3 + 3*stride+b_w*stride,stride);
+ s->put_snow_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst ,src + 3 + 3*stride ,stride);
+ s->put_snow_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst+b_w*stride,src + 3 + 3*stride+b_w*stride,stride);
}
}
}
@@ -404,7 +491,8 @@ av_cold int ff_snow_common_init(AVCodecContext *avctx){
ff_videodsp_init(&s->vdsp, 8);
ff_dwt_init(&s->dwt);
- ff_h264qpel_init(&s->h264qpel, 8);
+
+ init_qpel(s);
#define mcfh(dx,dy)\
s->hdsp.put_pixels_tab [0][dy/4+dx/8]=\
diff --git a/libavcodec/snow.h b/libavcodec/snow.h
index 9b19e70bd5..83dc6c1256 100644
--- a/libavcodec/snow.h
+++ b/libavcodec/snow.h
@@ -116,7 +116,11 @@ typedef struct SnowContext{
RangeCoder c;
HpelDSPContext hdsp;
VideoDSPContext vdsp;
- H264QpelContext h264qpel;
+ union {
+ /// everything except size 2 are from H.264
+ qpel_mc_func put_snow_qpel_pixels_tab[4][16];
+ H264QpelContext h264qpel;
+ };
SnowDWTContext dwt;
AVFrame *input_picture; ///< new_picture with the internal linesizes
AVFrame *current_picture;
diff --git a/tests/checkasm/h264qpel.c b/tests/checkasm/h264qpel.c
index 7387c2510a..6f4a021faf 100644
--- a/tests/checkasm/h264qpel.c
+++ b/tests/checkasm/h264qpel.c
@@ -64,7 +64,7 @@ void checkasm_check_h264qpel(void)
for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
ff_h264qpel_init(&h, bit_depth);
- for (i = 0; i < (op || bit_depth != 8 ? 3 : 4); i++) {
+ for (i = 0; i < 3; i++) {
int size = 16 >> i;
for (j = 0; j < 16; j++)
if (check_func(tab[i][j], "%s_h264_qpel_%d_mc%d%d_%d", op_name, size, j & 3, j >> 2, bit_depth)) {
--
2.49.1
From 61a5e967e9a90e8ab6a78231536735827a6939b1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 4 Oct 2025 09:19:23 +0200
Subject: [PATCH 3/8] avcodec/x86/h264_qpel_8bit: Don't duplicate
qpel4_hv_lowpass_v_mmxext
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/h264_qpel.c | 5 +++--
libavcodec/x86/h264_qpel_8bit.asm | 5 +++--
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 636be54530..45e0878e57 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -63,7 +63,6 @@ void ff_ ## OPNAME ## _h264_qpel16_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h);\
-void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, ptrdiff_t srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, ptrdiff_t srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride, int h);\
@@ -73,6 +72,8 @@ void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src
void ff_ ## OPNAME ## _pixels8_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, ptrdiff_t dstStride);\
+void ff_put_h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, ptrdiff_t srcStride);
+
DEF_QPEL(avg)
DEF_QPEL(put)
@@ -82,7 +83,7 @@ static av_always_inline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *ds
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
- ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
+ ff_put_h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
tmp += 4;\
src += 4;\
}\
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index bbf591664a..a610a831db 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -502,8 +502,8 @@ QPEL8OR16_V_LOWPASS_OP avg
SWAP 0, 1, 2, 3, 4, 5
%endmacro
-%macro QPEL4_HV1_LOWPASS_OP 1
-cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
+INIT_MMX mmxext
+cglobal put_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
pxor m7, m7
movh m0, [r0]
movh m1, [r0+r2]
@@ -524,6 +524,7 @@ cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
FILT_HV 3*24
RET
+%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
mov r3d, 4
.loop:
--
2.49.1
From 83a0d0ace9846bba82878c8993088d21f457a168 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 4 Oct 2025 10:15:09 +0200
Subject: [PATCH 4/8] avcodec/h264_qpel: Move loop into
qpel4_hv_lowpass_v_mmxext()
Every caller calls it three times in a loop, with slightly
modified arguments. So it makes sense to move the loop
into the callee.
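Illustratively, a minimal C sketch of the hoisting (hypothetical names
and a dummy filter body; the real change is in the mmxext assembly in
the diff below):

    #include <stddef.h>
    #include <stdint.h>

    /* stand-in for one 4-wide column of the vertical filter */
    static void filt_v_one_col(const uint8_t *src, int16_t *tmp,
                               ptrdiff_t srcStride)
    {
        for (int y = 0; y < 4; y++)
            tmp[y] = 20 * src[y * srcStride];  /* dummy body, shape only */
    }

    /* before: every caller repeated the three-column loop itself */
    static void caller_before(const uint8_t *src, int16_t *tmp,
                              ptrdiff_t srcStride)
    {
        for (int w = 3; w--; tmp += 4, src += 4)
            filt_v_one_col(src, tmp, srcStride);
    }

    /* after: the loop lives in the callee ... */
    static void filt_v_all_cols(const uint8_t *src, int16_t *tmp,
                                ptrdiff_t srcStride)
    {
        for (int w = 3; w--; tmp += 4, src += 4)
            filt_v_one_col(src, tmp, srcStride);
    }

    /* ... so each caller shrinks to a single call */
    static void caller_after(const uint8_t *src, int16_t *tmp,
                             ptrdiff_t srcStride)
    {
        filt_v_all_cols(src, tmp, srcStride);
    }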
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/h264_qpel.c | 8 +-------
libavcodec/x86/h264_qpel_8bit.asm | 10 +++++++++-
2 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 45e0878e57..af031fe2e9 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -80,14 +80,8 @@ DEF_QPEL(put)
#define QPEL_H264(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
- int w=3;\
src -= 2*srcStride+2;\
- while(w--){\
- ff_put_h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
- tmp += 4;\
- src += 4;\
- }\
- tmp -= 3*4;\
+ ff_put_h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index a610a831db..64c91ba63a 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -503,8 +503,11 @@ QPEL8OR16_V_LOWPASS_OP avg
%endmacro
INIT_MMX mmxext
-cglobal put_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
+cglobal put_h264_qpel4_hv_lowpass_v, 3,5 ; src, tmp, srcStride
+ mov r4d, 3
+ mov r3, r0
pxor m7, m7
+.loop:
movh m0, [r0]
movh m1, [r0+r2]
lea r0, [r0+2*r2]
@@ -522,6 +525,11 @@ cglobal put_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
FILT_HV 1*24
FILT_HV 2*24
FILT_HV 3*24
+ add r3, 4
+ add r1, 8
+ mov r0, r3
+ dec r4d
+ jnz .loop
RET
%macro QPEL4_HV1_LOWPASS_OP 1
--
2.49.1
From 0c81a90999459b12f31cc7e0463c2a5769cbf047 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 4 Oct 2025 10:21:38 +0200
Subject: [PATCH 5/8] avcodec/x86/h264_qpel_8bit: Avoid doing unnecessary work
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/h264_qpel_8bit.asm | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index 64c91ba63a..c66a9bda40 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -387,8 +387,12 @@ QPEL8_H_LOWPASS_L2_OP_XMM avg
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
-%macro FILT_V 1
+%macro FILT_V 1-2
+%ifnidn %2, last
mova m6, m2
+%else
+ SWAP 6, 2
+%endif
movh m5, [r1]
paddw m6, m3
psllw m6, 2
@@ -403,7 +407,9 @@ QPEL8_H_LOWPASS_L2_OP_XMM avg
psraw m6, 5
packuswb m6, m6
op_%1h m6, [r0], m0 ; 1
+%ifnidn %2, last
add r0, r2
+%endif
SWAP 0, 1, 2, 3, 4, 5
%endmacro
@@ -428,7 +434,7 @@ cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
FILT_V %1
FILT_V %1
FILT_V %1
- FILT_V %1
+ FILT_V %1, last
RET
%endmacro
@@ -473,7 +479,7 @@ cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
FILT_V %1
FILT_V %1
FILT_V %1
- FILT_V %1
+ FILT_V %1, last
.end:
RET
%endmacro
@@ -485,8 +491,12 @@ QPEL8OR16_V_LOWPASS_OP avg
; All functions that use this are required to have args:
; src, tmp, srcSize
-%macro FILT_HV 1 ; offset
+%macro FILT_HV 1-2 ; offset, last
+%ifnidn %2, last
mova m6, m2
+%else
+ SWAP 2, 6
+%endif
movh m5, [r0]
paddw m6, m3
psllw m6, 2
@@ -496,7 +506,9 @@ QPEL8OR16_V_LOWPASS_OP avg
punpcklbw m5, m7
pmullw m6, [pw_5]
paddw m0, m5
+%ifnidn %2, last
add r0, r2
+%endif
paddw m6, m0
mova [r1+%1], m6
SWAP 0, 1, 2, 3, 4, 5
@@ -524,7 +536,7 @@ cglobal put_h264_qpel4_hv_lowpass_v, 3,5 ; src, tmp, srcStride
FILT_HV 0*24
FILT_HV 1*24
FILT_HV 2*24
- FILT_HV 3*24
+ FILT_HV 3*24, last
add r3, 4
add r1, 8
mov r0, r3
@@ -595,7 +607,7 @@ cglobal put_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
FILT_HV 12*48
FILT_HV 13*48
FILT_HV 14*48
- FILT_HV 15*48
+ FILT_HV 15*48, last
.end:
RET
--
2.49.1
From 3ad8a75f9b5acd47f644ecf199cd33d324d9cf2b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 4 Oct 2025 12:38:32 +0200
Subject: [PATCH 6/8] avcodec/x86/h264_qpel: Port
qpel8or16_hv2_lowpass_op_mmxext to SSE2
This means that only blocksize 4 still uses mmx(ext).
Old benchmarks:
avg_h264_qpel_8_mc12_8_c: 428.4 ( 1.00x)
avg_h264_qpel_8_mc12_8_sse2: 74.3 ( 5.77x)
avg_h264_qpel_8_mc12_8_ssse3: 69.3 ( 6.18x)
avg_h264_qpel_8_mc21_8_c: 401.4 ( 1.00x)
avg_h264_qpel_8_mc21_8_sse2: 97.8 ( 4.10x)
avg_h264_qpel_8_mc21_8_ssse3: 93.7 ( 4.28x)
avg_h264_qpel_8_mc22_8_c: 281.8 ( 1.00x)
avg_h264_qpel_8_mc22_8_sse2: 66.7 ( 4.23x)
avg_h264_qpel_8_mc22_8_ssse3: 62.6 ( 4.50x)
avg_h264_qpel_8_mc23_8_c: 397.2 ( 1.00x)
avg_h264_qpel_8_mc23_8_sse2: 97.9 ( 4.06x)
avg_h264_qpel_8_mc23_8_ssse3: 93.7 ( 4.24x)
avg_h264_qpel_8_mc32_8_c: 432.4 ( 1.00x)
avg_h264_qpel_8_mc32_8_sse2: 73.9 ( 5.85x)
avg_h264_qpel_8_mc32_8_ssse3: 69.5 ( 6.22x)
avg_h264_qpel_16_mc12_8_c: 1756.4 ( 1.00x)
avg_h264_qpel_16_mc12_8_sse2: 240.0 ( 7.32x)
avg_h264_qpel_16_mc12_8_ssse3: 204.5 ( 8.59x)
avg_h264_qpel_16_mc21_8_c: 1635.3 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2: 321.2 ( 5.09x)
avg_h264_qpel_16_mc21_8_ssse3: 288.5 ( 5.67x)
avg_h264_qpel_16_mc22_8_c: 1130.8 ( 1.00x)
avg_h264_qpel_16_mc22_8_sse2: 219.4 ( 5.15x)
avg_h264_qpel_16_mc22_8_ssse3: 182.2 ( 6.21x)
avg_h264_qpel_16_mc23_8_c: 1622.5 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2: 321.3 ( 5.05x)
avg_h264_qpel_16_mc23_8_ssse3: 289.5 ( 5.60x)
avg_h264_qpel_16_mc32_8_c: 1762.5 ( 1.00x)
avg_h264_qpel_16_mc32_8_sse2: 236.1 ( 7.46x)
avg_h264_qpel_16_mc32_8_ssse3: 205.2 ( 8.59x)
put_h264_qpel_8_mc12_8_c: 427.2 ( 1.00x)
put_h264_qpel_8_mc12_8_sse2: 72.1 ( 5.93x)
put_h264_qpel_8_mc12_8_ssse3: 67.0 ( 6.38x)
put_h264_qpel_8_mc21_8_c: 402.9 ( 1.00x)
put_h264_qpel_8_mc21_8_sse2: 95.9 ( 4.20x)
put_h264_qpel_8_mc21_8_ssse3: 91.9 ( 4.38x)
put_h264_qpel_8_mc22_8_c: 235.0 ( 1.00x)
put_h264_qpel_8_mc22_8_sse2: 64.6 ( 3.64x)
put_h264_qpel_8_mc22_8_ssse3: 60.0 ( 3.92x)
put_h264_qpel_8_mc23_8_c: 403.6 ( 1.00x)
put_h264_qpel_8_mc23_8_sse2: 95.9 ( 4.21x)
put_h264_qpel_8_mc23_8_ssse3: 91.7 ( 4.40x)
put_h264_qpel_8_mc32_8_c: 430.7 ( 1.00x)
put_h264_qpel_8_mc32_8_sse2: 72.1 ( 5.97x)
put_h264_qpel_8_mc32_8_ssse3: 67.0 ( 6.43x)
put_h264_qpel_16_mc12_8_c: 1724.2 ( 1.00x)
put_h264_qpel_16_mc12_8_sse2: 230.7 ( 7.47x)
put_h264_qpel_16_mc12_8_ssse3: 199.8 ( 8.63x)
put_h264_qpel_16_mc21_8_c: 1613.3 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2: 327.5 ( 4.93x)
put_h264_qpel_16_mc21_8_ssse3: 297.2 ( 5.43x)
put_h264_qpel_16_mc22_8_c: 959.2 ( 1.00x)
put_h264_qpel_16_mc22_8_sse2: 211.9 ( 4.53x)
put_h264_qpel_16_mc22_8_ssse3: 186.1 ( 5.15x)
put_h264_qpel_16_mc23_8_c: 1619.0 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2: 319.7 ( 5.06x)
put_h264_qpel_16_mc23_8_ssse3: 299.2 ( 5.41x)
put_h264_qpel_16_mc32_8_c: 1741.7 ( 1.00x)
put_h264_qpel_16_mc32_8_sse2: 230.9 ( 7.54x)
put_h264_qpel_16_mc32_8_ssse3: 199.4 ( 8.74x)
New benchmarks:
avg_h264_qpel_8_mc12_8_c: 427.2 ( 1.00x)
avg_h264_qpel_8_mc12_8_sse2: 63.9 ( 6.69x)
avg_h264_qpel_8_mc12_8_ssse3: 69.2 ( 6.18x)
avg_h264_qpel_8_mc21_8_c: 399.2 ( 1.00x)
avg_h264_qpel_8_mc21_8_sse2: 87.7 ( 4.55x)
avg_h264_qpel_8_mc21_8_ssse3: 93.9 ( 4.25x)
avg_h264_qpel_8_mc22_8_c: 285.7 ( 1.00x)
avg_h264_qpel_8_mc22_8_sse2: 56.4 ( 5.07x)
avg_h264_qpel_8_mc22_8_ssse3: 62.6 ( 4.56x)
avg_h264_qpel_8_mc23_8_c: 398.6 ( 1.00x)
avg_h264_qpel_8_mc23_8_sse2: 87.6 ( 4.55x)
avg_h264_qpel_8_mc23_8_ssse3: 93.8 ( 4.25x)
avg_h264_qpel_8_mc32_8_c: 425.8 ( 1.00x)
avg_h264_qpel_8_mc32_8_sse2: 63.8 ( 6.67x)
avg_h264_qpel_8_mc32_8_ssse3: 69.0 ( 6.17x)
avg_h264_qpel_16_mc12_8_c: 1748.2 ( 1.00x)
avg_h264_qpel_16_mc12_8_sse2: 198.5 ( 8.81x)
avg_h264_qpel_16_mc12_8_ssse3: 203.2 ( 8.60x)
avg_h264_qpel_16_mc21_8_c: 1638.1 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2: 277.4 ( 5.91x)
avg_h264_qpel_16_mc21_8_ssse3: 291.1 ( 5.63x)
avg_h264_qpel_16_mc22_8_c: 1140.7 ( 1.00x)
avg_h264_qpel_16_mc22_8_sse2: 180.3 ( 6.33x)
avg_h264_qpel_16_mc22_8_ssse3: 181.9 ( 6.27x)
avg_h264_qpel_16_mc23_8_c: 1629.9 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2: 278.0 ( 5.86x)
avg_h264_qpel_16_mc23_8_ssse3: 291.0 ( 5.60x)
avg_h264_qpel_16_mc32_8_c: 1752.1 ( 1.00x)
avg_h264_qpel_16_mc32_8_sse2: 193.7 ( 9.05x)
avg_h264_qpel_16_mc32_8_ssse3: 203.4 ( 8.61x)
put_h264_qpel_8_mc12_8_c: 421.8 ( 1.00x)
put_h264_qpel_8_mc12_8_sse2: 61.7 ( 6.83x)
put_h264_qpel_8_mc12_8_ssse3: 67.2 ( 6.28x)
put_h264_qpel_8_mc21_8_c: 396.8 ( 1.00x)
put_h264_qpel_8_mc21_8_sse2: 85.4 ( 4.65x)
put_h264_qpel_8_mc21_8_ssse3: 91.6 ( 4.33x)
put_h264_qpel_8_mc22_8_c: 234.1 ( 1.00x)
put_h264_qpel_8_mc22_8_sse2: 54.4 ( 4.30x)
put_h264_qpel_8_mc22_8_ssse3: 60.2 ( 3.89x)
put_h264_qpel_8_mc23_8_c: 399.2 ( 1.00x)
put_h264_qpel_8_mc23_8_sse2: 85.5 ( 4.67x)
put_h264_qpel_8_mc23_8_ssse3: 91.8 ( 4.35x)
put_h264_qpel_8_mc32_8_c: 422.2 ( 1.00x)
put_h264_qpel_8_mc32_8_sse2: 61.8 ( 6.83x)
put_h264_qpel_8_mc32_8_ssse3: 67.0 ( 6.30x)
put_h264_qpel_16_mc12_8_c: 1720.3 ( 1.00x)
put_h264_qpel_16_mc12_8_sse2: 189.9 ( 9.06x)
put_h264_qpel_16_mc12_8_ssse3: 199.9 ( 8.61x)
put_h264_qpel_16_mc21_8_c: 1624.5 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2: 285.4 ( 5.69x)
put_h264_qpel_16_mc21_8_ssse3: 296.4 ( 5.48x)
put_h264_qpel_16_mc22_8_c: 963.9 ( 1.00x)
put_h264_qpel_16_mc22_8_sse2: 170.1 ( 5.67x)
put_h264_qpel_16_mc22_8_ssse3: 186.4 ( 5.17x)
put_h264_qpel_16_mc23_8_c: 1613.5 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2: 274.6 ( 5.88x)
put_h264_qpel_16_mc23_8_ssse3: 300.4 ( 5.37x)
put_h264_qpel_16_mc32_8_c: 1735.9 ( 1.00x)
put_h264_qpel_16_mc32_8_sse2: 189.6 ( 9.15x)
put_h264_qpel_16_mc32_8_ssse3: 199.5 ( 8.70x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/h264_qpel.c | 4 ++--
libavcodec/x86/h264_qpel_8bit.asm | 34 +++++++++++--------------------
2 files changed, 14 insertions(+), 24 deletions(-)
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index af031fe2e9..f4082e2242 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -65,7 +65,7 @@ void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, ptrdiff_t srcStride, int size);\
-void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride, int h);\
+void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_sse2(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, ptrdiff_t dstStride);\
@@ -89,7 +89,7 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX
{\
int w = size>>4;\
do{\
- ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, size);\
+ ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_sse2(dst, tmp, dstStride, size);\
tmp += 8;\
dst += 8;\
}while(w--);\
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index c66a9bda40..39a387b4bb 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -613,36 +613,26 @@ cglobal put_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
%macro QPEL8OR16_HV2_LOWPASS_OP 1
-cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4 ; dst, tmp, dstStride, h
+cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4,6 ; dst, tmp, dstStride, h
.loop:
mova m0, [r1]
- mova m3, [r1+8]
- mova m1, [r1+2]
- mova m4, [r1+10]
- paddw m0, m4
- paddw m1, m3
- paddw m3, [r1+18]
- paddw m4, [r1+16]
- mova m2, [r1+4]
- mova m5, [r1+12]
- paddw m2, [r1+6]
- paddw m5, [r1+14]
+ movu m1, [r1+2]
+ movu m3, [r1+10]
+ movu m4, [r1+8]
+ movu m2, [r1+4]
+ movu m5, [r1+6]
+ paddw m0, m3
+ paddw m1, m4
psubw m0, m1
- psubw m3, m4
psraw m0, 2
- psraw m3, 2
+ paddw m2, m5
psubw m0, m1
- psubw m3, m4
paddsw m0, m2
- paddsw m3, m5
psraw m0, 2
- psraw m3, 2
paddw m0, m2
- paddw m3, m5
psraw m0, 6
- psraw m3, 6
- packuswb m0, m3
- op_%1 m0, [r0], m7
+ packuswb m0, m0
+ op_%1h m0, [r0], m5
add r1, 48
add r0, r2
dec r3d
@@ -650,7 +640,7 @@ cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4 ; dst, tmp, dstStride, h
RET
%endmacro
-INIT_MMX mmxext
+INIT_XMM sse2
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
--
2.49.1
From 65beb628def380385dc0c8af4b09c8ade2f2ac9f Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 4 Oct 2025 15:07:18 +0200
Subject: [PATCH 7/8] avcodec/x86/h264_qpel: Split hv2_lowpass_sse2 into size
8,16 funcs
This is beneficial size-wise: the 384B of new asm functions are more
than outweighed by the 416B saved through simpler calls here (for size 16,
the size 8 function had been called twice).
It also makes the code more readable, as it allowed several wrappers
in h264_qpel.c to be removed.
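A rough C sketch of what the removed wrapper did (hypothetical names
and empty stand-in bodies; the real replacements are the dedicated
size-8 and size-16 asm entry points in the diff below):

    #include <stdint.h>

    static void hv2_lowpass_8(uint8_t *dst)  { (void)dst; }  /* 8 columns */
    static void hv2_lowpass_16(uint8_t *dst) { (void)dst; }  /* 16 columns */

    /* before: one size-parameterized wrapper sliced the block into 8-wide
     * strips, so the size-16 case ran the 8-wide code twice */
    static void hv2_lowpass_before(uint8_t *dst, int size)
    {
        int w = size >> 4;
        do {
            hv2_lowpass_8(dst);
            dst += 8;
        } while (w--);
    }

    /* after: dedicated functions are called directly, so this wrapper and
     * the ssse3 size dispatcher can be dropped */
    static void mc22_8(uint8_t *dst)  { hv2_lowpass_8(dst);  }
    static void mc22_16(uint8_t *dst) { hv2_lowpass_16(dst); }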
It is also beneficial performance-wise. Old benchmarks:
avg_h264_qpel_16_mc12_8_c: 1757.7 ( 1.00x)
avg_h264_qpel_16_mc12_8_sse2: 197.7 ( 8.89x)
avg_h264_qpel_16_mc12_8_ssse3: 204.6 ( 8.59x)
avg_h264_qpel_16_mc21_8_c: 1631.6 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2: 276.4 ( 5.90x)
avg_h264_qpel_16_mc21_8_ssse3: 290.7 ( 5.61x)
avg_h264_qpel_16_mc22_8_c: 1122.7 ( 1.00x)
avg_h264_qpel_16_mc22_8_sse2: 179.5 ( 6.25x)
avg_h264_qpel_16_mc22_8_ssse3: 181.8 ( 6.17x)
avg_h264_qpel_16_mc23_8_c: 1626.7 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2: 276.8 ( 5.88x)
avg_h264_qpel_16_mc23_8_ssse3: 290.9 ( 5.59x)
avg_h264_qpel_16_mc32_8_c: 1754.1 ( 1.00x)
avg_h264_qpel_16_mc32_8_sse2: 193.8 ( 9.05x)
avg_h264_qpel_16_mc32_8_ssse3: 203.6 ( 8.62x)
put_h264_qpel_16_mc12_8_c: 1733.6 ( 1.00x)
put_h264_qpel_16_mc12_8_sse2: 189.6 ( 9.14x)
put_h264_qpel_16_mc12_8_ssse3: 199.6 ( 8.69x)
put_h264_qpel_16_mc21_8_c: 1616.0 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2: 284.3 ( 5.69x)
put_h264_qpel_16_mc21_8_ssse3: 296.5 ( 5.45x)
put_h264_qpel_16_mc22_8_c: 963.7 ( 1.00x)
put_h264_qpel_16_mc22_8_sse2: 169.9 ( 5.67x)
put_h264_qpel_16_mc22_8_ssse3: 186.1 ( 5.18x)
put_h264_qpel_16_mc23_8_c: 1607.2 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2: 275.0 ( 5.84x)
put_h264_qpel_16_mc23_8_ssse3: 297.8 ( 5.40x)
put_h264_qpel_16_mc32_8_c: 1734.7 ( 1.00x)
put_h264_qpel_16_mc32_8_sse2: 189.4 ( 9.16x)
put_h264_qpel_16_mc32_8_ssse3: 199.4 ( 8.70x)
New benchmarks:
avg_h264_qpel_16_mc12_8_c: 1743.7 ( 1.00x)
avg_h264_qpel_16_mc12_8_sse2: 189.7 ( 9.19x)
avg_h264_qpel_16_mc12_8_ssse3: 204.4 ( 8.53x)
avg_h264_qpel_16_mc21_8_c: 1637.7 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2: 267.7 ( 6.12x)
avg_h264_qpel_16_mc21_8_ssse3: 291.5 ( 5.62x)
avg_h264_qpel_16_mc22_8_c: 1150.3 ( 1.00x)
avg_h264_qpel_16_mc22_8_sse2: 164.6 ( 6.99x)
avg_h264_qpel_16_mc22_8_ssse3: 182.1 ( 6.32x)
avg_h264_qpel_16_mc23_8_c: 1635.3 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2: 268.5 ( 6.09x)
avg_h264_qpel_16_mc23_8_ssse3: 298.5 ( 5.48x)
avg_h264_qpel_16_mc32_8_c: 1740.6 ( 1.00x)
avg_h264_qpel_16_mc32_8_sse2: 182.6 ( 9.53x)
avg_h264_qpel_16_mc32_8_ssse3: 201.9 ( 8.62x)
put_h264_qpel_16_mc12_8_c: 1727.4 ( 1.00x)
put_h264_qpel_16_mc12_8_sse2: 188.1 ( 9.18x)
put_h264_qpel_16_mc12_8_ssse3: 199.6 ( 8.65x)
put_h264_qpel_16_mc21_8_c: 1623.5 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2: 265.9 ( 6.11x)
put_h264_qpel_16_mc21_8_ssse3: 299.4 ( 5.42x)
put_h264_qpel_16_mc22_8_c: 954.0 ( 1.00x)
put_h264_qpel_16_mc22_8_sse2: 161.8 ( 5.89x)
put_h264_qpel_16_mc22_8_ssse3: 180.4 ( 5.29x)
put_h264_qpel_16_mc23_8_c: 1611.2 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2: 265.8 ( 6.06x)
put_h264_qpel_16_mc23_8_ssse3: 300.3 ( 5.37x)
put_h264_qpel_16_mc32_8_c: 1734.5 ( 1.00x)
put_h264_qpel_16_mc32_8_sse2: 180.0 ( 9.63x)
put_h264_qpel_16_mc32_8_ssse3: 199.7 ( 8.69x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/h264_qpel.c | 39 +++-----------------
libavcodec/x86/h264_qpel_8bit.asm | 61 ++++++++++++++++++++-----------
2 files changed, 45 insertions(+), 55 deletions(-)
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index f4082e2242..9b22c74286 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -65,7 +65,8 @@ void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, ptrdiff_t srcStride, int size);\
-void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_sse2(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride, int h);\
+void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_sse2(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride);\
+void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_sse2(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, ptrdiff_t dstStride);\
@@ -84,16 +85,6 @@ static av_always_inline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *ds
ff_put_h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
-\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride, int size)\
-{\
- int w = size>>4;\
- do{\
- ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_sse2(dst, tmp, dstStride, size);\
- tmp += 8;\
- dst += 8;\
- }while(w--);\
-}\
#define QPEL_H264_H16(OPNAME, EXT) \
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## EXT(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)\
@@ -156,40 +147,22 @@ static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
}
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
-static av_always_inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int size)\
-{\
- put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, srcStride, size);\
- ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, size);\
-}\
static av_always_inline void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 8);\
+ put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, srcStride, 8);\
+ ff_ ## OPNAME ## h264_qpel8_hv2_lowpass_ ## MMX(dst, tmp, dstStride);\
}\
static av_always_inline void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 16);\
+ put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, srcStride, 16);\
+ ff_ ## OPNAME ## h264_qpel16_hv2_lowpass_ ## MMX(dst, tmp, dstStride);\
}\
-#define SSSE3_HV2_LOWPASS_WRAPPER(OPNAME) \
-static av_always_inline void \
-ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride, int size) \
-{\
- if (size == 8)\
- ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(dst, tmp, dstStride);\
- else\
- ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(dst, tmp, dstStride);\
-}
-SSSE3_HV2_LOWPASS_WRAPPER(avg)
-SSSE3_HV2_LOWPASS_WRAPPER(put)
-
#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
-#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
-#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
-
#define ff_put_pixels4_l2_shift5_sse2 ff_put_pixels4_l2_shift5_mmxext
#define ff_avg_pixels4_l2_shift5_sse2 ff_avg_pixels4_l2_shift5_mmxext
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index 39a387b4bb..6e082819ac 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -611,28 +611,45 @@ cglobal put_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
.end:
RET
-
-%macro QPEL8OR16_HV2_LOWPASS_OP 1
-cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4,6 ; dst, tmp, dstStride, h
-.loop:
- mova m0, [r1]
- movu m1, [r1+2]
- movu m3, [r1+10]
- movu m4, [r1+8]
- movu m2, [r1+4]
- movu m5, [r1+6]
- paddw m0, m3
+%macro HV2_LOWPASS 2
+ mova %1, [r1+%2]
+ movu m1, [r1+2+%2]
+ movu m3, [r1+10+%2]
+ movu m4, [r1+8+%2]
+ movu m2, [r1+4+%2]
+ paddw %1, m3
+ movu m3, [r1+6+%2]
paddw m1, m4
- psubw m0, m1
- psraw m0, 2
- paddw m2, m5
- psubw m0, m1
- paddsw m0, m2
- psraw m0, 2
- paddw m0, m2
- psraw m0, 6
+ psubw %1, m1
+ psraw %1, 2
+ paddw m2, m3
+ psubw %1, m1
+ paddsw %1, m2
+ psraw %1, 2
+ paddw %1, m2
+ psraw %1, 6
+%endmacro
+
+%macro QPEL8AND16_HV2_LOWPASS_OP 1
+cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride
+ mov r3d, 8
+.loop:
+ HV2_LOWPASS m0, 0
packuswb m0, m0
- op_%1h m0, [r0], m5
+ op_%1h m0, [r0], m3
+ add r1, 48
+ add r0, r2
+ dec r3d
+ jne .loop
+ RET
+
+cglobal %1_h264_qpel16_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride
+ mov r3d, 16
+.loop:
+ HV2_LOWPASS m0, 0
+ HV2_LOWPASS m5, 16
+ packuswb m0, m5
+ op_%1 m0, [r0], m3
add r1, 48
add r0, r2
dec r3d
@@ -641,8 +658,8 @@ cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4,6 ; dst, tmp, dstStride, h
%endmacro
INIT_XMM sse2
-QPEL8OR16_HV2_LOWPASS_OP put
-QPEL8OR16_HV2_LOWPASS_OP avg
+QPEL8AND16_HV2_LOWPASS_OP put
+QPEL8AND16_HV2_LOWPASS_OP avg
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride
--
2.49.1
From fcf92b4db9feeb2b626d171317d2609a7d7402b4 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 4 Oct 2025 15:45:36 +0200
Subject: [PATCH 8/8] avcodec/x86/h264_qpel: Simplify macros
1. Remove the OP parameter from the QPEL_H264* macros. It is a
remnant of inline assembly that was forgotten in
610e00b3594bf0f2a75713f20e9c4edf0d03a818.
2. Pass the instruction set extension for the shift5 function
explicitly in the macro instead of using magic #defines
(see the sketch after this list).
3. Likewise, avoid magic #defines for (8|16)_v_lowpass_ssse3.
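To make item 2 concrete, a tiny self-contained sketch of the pattern
(hypothetical names; the real macros are the H264_MC_* family and the
ff_*_pixels*_l2_shift5_* functions):

    #include <stdio.h>

    static void shift5_4_mmxext(void) { puts("4-wide shift5, mmxext"); }
    static void shift5_8_sse2(void)   { puts("8-wide shift5, sse2"); }

    /* old style: the macro body always pastes "_sse2", and a magic alias
     * quietly redirects the 4-wide instantiation to the mmxext symbol */
    #define shift5_4_sse2 shift5_4_mmxext
    #define MC_OLD(SIZE)                        \
        static void mc_old_ ## SIZE(void)       \
        { shift5_ ## SIZE ## _sse2(); }

    /* new style: the extension is an explicit macro parameter */
    #define MC_NEW(SIZE, SHIFT5_EXT)            \
        static void mc_new_ ## SIZE(void)       \
        { shift5_ ## SIZE ## _ ## SHIFT5_EXT(); }

    MC_OLD(4)         /* reaches shift5_4_mmxext only via the hidden alias */
    MC_OLD(8)         /* calls shift5_8_sse2 */
    MC_NEW(4, mmxext) /* the redirection is now visible at the call site */
    MC_NEW(8, sse2)

    int main(void)
    {
        mc_old_4(); mc_old_8();
        mc_new_4(); mc_new_8();
        return 0;
    }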
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/h264_qpel.c | 98 ++++++++++++++++++--------------------
1 file changed, 46 insertions(+), 52 deletions(-)
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 9b22c74286..f7596329e2 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -78,7 +78,7 @@ void ff_put_h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, ptr
DEF_QPEL(avg)
DEF_QPEL(put)
-#define QPEL_H264(OPNAME, OP, MMX)\
+#define QPEL_H264(OPNAME, MMX)\
static av_always_inline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
src -= 2*srcStride+2;\
@@ -100,17 +100,17 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## EXT(u
#if ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+#define QPEL_H264_H16_XMM(OPNAME, MMX)\
void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride);
#else // ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, EXT) QPEL_H264_H16(OPNAME, EXT)
+#define QPEL_H264_H16_XMM(OPNAME, EXT) QPEL_H264_H16(OPNAME, EXT)
#endif // ARCH_X86_64
-#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
-QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+#define QPEL_H264_H_XMM(OPNAME, MMX)\
+QPEL_H264_H16_XMM(OPNAME, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
@@ -121,15 +121,15 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
-#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+#define QPEL_H264_V_XMM(OPNAME, XMM, XMM2)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## XMM2(dst , src , dstStride, srcStride, 8);\
}\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## XMM2(dst , src , dstStride, srcStride, 16);\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## XMM2(dst+8, src+8, dstStride, srcStride, 16);\
}
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
@@ -146,7 +146,7 @@ static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
}
}
-#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
+#define QPEL_H264_HV_XMM(OPNAME, MMX)\
static av_always_inline void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, srcStride, 8);\
@@ -158,18 +158,10 @@ static av_always_inline void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *d
ff_ ## OPNAME ## h264_qpel16_hv2_lowpass_ ## MMX(dst, tmp, dstStride);\
}\
-#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
-#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
-#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
-#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
-
-#define ff_put_pixels4_l2_shift5_sse2 ff_put_pixels4_l2_shift5_mmxext
-#define ff_avg_pixels4_l2_shift5_sse2 ff_avg_pixels4_l2_shift5_mmxext
-
-#define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
-H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
+#define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT) \
+H264_MC_V(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT)\
+H264_MC_H(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT)\
+H264_MC_HV(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT)\
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
@@ -188,7 +180,7 @@ static void avg_h264_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
ff_avg_pixels8_mmxext(dst, src, stride, 8);
}
-#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
+#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN, UNUSED) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
@@ -204,7 +196,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uin
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\
-#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
+#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN, UNUSED) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
@@ -224,7 +216,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\
-#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
+#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
@@ -286,7 +278,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uin
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((uintptr_t)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\
- ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_sse2(dst, halfV+2, halfHV, stride);\
+ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_ ## SHIFT5_EXT(dst, halfV+2, halfHV, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
@@ -296,35 +288,37 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uin
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((uintptr_t)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\
- ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_sse2(dst, halfV+3, halfHV, stride);\
+ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_ ## SHIFT5_EXT(dst, halfV+3, halfHV, stride);\
}\
-#define H264_MC(QPEL, SIZE, MMX, ALIGN)\
-QPEL(put_, SIZE, MMX, ALIGN) \
-QPEL(avg_, SIZE, MMX, ALIGN) \
+#define H264_MC(QPEL, SIZE, MMX, ALIGN, SHIFT5_EXT)\
+QPEL(put_, SIZE, MMX, ALIGN, SHIFT5_EXT) \
+QPEL(avg_, SIZE, MMX, ALIGN, SHIFT5_EXT) \
-#define H264_MC_816(QPEL, XMM)\
-QPEL(put_, 8, XMM, 16)\
-QPEL(put_, 16,XMM, 16)\
-QPEL(avg_, 8, XMM, 16)\
-QPEL(avg_, 16,XMM, 16)\
+#define H264_MC_816(QPEL, XMM, SHIFT5_EXT)\
+QPEL(put_, 8, XMM, 16, SHIFT5_EXT)\
+QPEL(put_, 16,XMM, 16, SHIFT5_EXT)\
+QPEL(avg_, 8, XMM, 16, SHIFT5_EXT)\
+QPEL(avg_, 16,XMM, 16, SHIFT5_EXT)\
-QPEL_H264(put_, PUT_OP, mmxext)
-QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
-QPEL_H264_V_XMM(put_, PUT_OP, sse2)
-QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
-QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
-QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
-QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
-QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
+QPEL_H264(put_, mmxext)
+QPEL_H264(avg_, mmxext)
+QPEL_H264_V_XMM(put_, sse2, sse2)
+QPEL_H264_V_XMM(avg_, sse2, sse2)
+QPEL_H264_HV_XMM(put_, sse2)
+QPEL_H264_HV_XMM(avg_, sse2)
+QPEL_H264_H_XMM(put_, ssse3)
+QPEL_H264_H_XMM(avg_, ssse3)
+QPEL_H264_V_XMM(put_, ssse3, sse2)
+QPEL_H264_V_XMM(avg_, ssse3, sse2)
+QPEL_H264_HV_XMM(put_, ssse3)
+QPEL_H264_HV_XMM(avg_, ssse3)
-H264_MC(H264_MC_V_H_HV, 4, mmxext, 8)
-H264_MC_816(H264_MC_V, sse2)
-H264_MC_816(H264_MC_HV, sse2)
-H264_MC_816(H264_MC_H, ssse3)
-H264_MC_816(H264_MC_HV, ssse3)
+H264_MC(H264_MC_V_H_HV, 4, mmxext, 8, mmxext)
+H264_MC_816(H264_MC_V, sse2, sse2)
+H264_MC_816(H264_MC_HV, sse2, sse2)
+H264_MC_816(H264_MC_H, ssse3, sse2)
+H264_MC_816(H264_MC_HV, ssse3, sse2)
//10bit
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org