From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: mkver <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PR] avcodec/x86/cfhddsp: Avoid pmaddwd (PR #21581)
Date: Mon, 26 Jan 2026 02:39:14 -0000
Message-ID: <176939515513.25.11347051055088177752@4457048688e7> (raw)
PR #21581 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21581
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21581.patch
Also improve cfhdencdsp a bit.
>From 1c7ff999a850a20a8b3de9f90a986d4a89835b3c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 25 Jan 2026 19:19:39 +0100
Subject: [PATCH 1/6] avcodec/x86/cfhdencdsp: Avoid load of -1
It can be easily generated at runtime.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/cfhdencdsp.asm | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
index 4aaeb56972..3d4aa90e96 100644
--- a/libavcodec/x86/cfhdencdsp.asm
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -31,7 +31,6 @@ pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5
pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5
pd_4: times 4 dd 4
pw_n4: times 8 dw -4
-cextern pw_m1
cextern pw_1
cextern pw_4
@@ -45,7 +44,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwid
shl hwidthq, 1
mova m7, [pd_4]
mova m8, [pw_1]
- mova m9, [pw_m1]
+ pcmpeqw m9, m9 ; -1
mova m10,[pw_p1_n1]
movsxdifnidn yq, yd
movsxdifnidn widthq, widthd
@@ -207,7 +206,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt
mova m7, [pd_4]
mova m8, [pw_1]
- mova m9, [pw_m1]
+ pcmpeqw m9, m9 ; -1
mova m10,[pw_p1_n1]
mova m11,[pw_n1_p1]
mova m12,[pw_4]
--
2.52.0
>From 964a58b29677093fe2e195cb7c6fc43234967f22 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 25 Jan 2026 20:16:57 +0100
Subject: [PATCH 2/6] avcodec/x86/cfhdencdsp: Avoid unnecessary constants
Up until now, cfhdencdsp used constants consisting
of -1, 1, ..., -1, 1 words and 1, -1, ..., 1, -1 words
as coefficients for pmaddwd. But one can use
the same constant for both if one shuffles the words in
each dword into the opposite order. Similarly for some other
constants. This also allows avoiding a register in
cfhdenc_vert_filter.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/cfhdencdsp.asm | 38 +++++++++++++----------------------
1 file changed, 14 insertions(+), 24 deletions(-)
diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
index 3d4aa90e96..73e12f283e 100644
--- a/libavcodec/x86/cfhdencdsp.asm
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -24,11 +24,8 @@
SECTION_RODATA
pw_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1
-pw_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1
pw_p5_n11: dw 5, -11, 5, -11, 5, -11, 5, -11
-pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11
pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5
-pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5
pd_4: times 4 dd 4
pw_n4: times 8 dw -4
cextern pw_1
@@ -44,7 +41,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwid
shl hwidthq, 1
mova m7, [pd_4]
mova m8, [pw_1]
- pcmpeqw m9, m9 ; -1
+ pcmpeqw m9, m9 ; -1
mova m10,[pw_p1_n1]
movsxdifnidn yq, yd
movsxdifnidn widthq, widthd
@@ -196,7 +193,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwid
%if ARCH_X86_64
INIT_XMM sse2
-cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos
+cglobal cfhdenc_vert_filter, 8, 11, 13, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos
shl istrideq, 1
shl widthd, 1
@@ -208,9 +205,8 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt
mova m8, [pw_1]
pcmpeqw m9, m9 ; -1
mova m10,[pw_p1_n1]
- mova m11,[pw_n1_p1]
- mova m12,[pw_4]
- mova m13,[pw_n4]
+ mova m11, [pw_4]
+ mova m12, [pw_n4]
.loopw:
mov yq, 2
@@ -237,9 +233,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt
add posq, istrideq
movu m5, [inputq + posq]
- mova m6, m0
- punpcklwd m0, m1
- punpckhwd m1, m6
+ SBUTTERFLY wd, 0, 1, 6
mova m6, m2
punpcklwd m2, m3
@@ -250,9 +244,9 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt
punpckhwd m5, m6
pmaddwd m0, [pw_p5_n11]
- pmaddwd m1, [pw_n11_p5]
- pmaddwd m2, m12
- pmaddwd m3, m12
+ pmaddwd m1, [pw_p5_n11]
+ pmaddwd m2, m11
+ pmaddwd m3, m11
pmaddwd m4, m9
pmaddwd m5, m9
@@ -313,9 +307,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt
punpcklwd m0, m1
punpckhwd m1, m6
- mova m6, m2
- punpcklwd m2, m3
- punpckhwd m3, m6
+ SBUTTERFLY wd, 2, 3, 6
mova m6, m4
punpcklwd m4, m5
@@ -324,7 +316,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt
pmaddwd m0, m9
pmaddwd m1, m9
pmaddwd m2, m10
- pmaddwd m3, m11
+ pmaddwd m3, m10
pmaddwd m4, m8
pmaddwd m5, m8
@@ -394,16 +386,14 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt
punpcklwd m2, m3
punpckhwd m3, m6
- mova m6, m4
- punpcklwd m4, m5
- punpckhwd m5, m6
+ SBUTTERFLY wd, 4, 5, 6
pmaddwd m0, m8
pmaddwd m1, m8
- pmaddwd m2, m13
- pmaddwd m3, m13
+ pmaddwd m2, m12
+ pmaddwd m3, m12
pmaddwd m4, [pw_p11_n5]
- pmaddwd m5, [pw_n5_p11]
+ pmaddwd m5, [pw_p11_n5]
paddd m4, m2
paddd m5, m3
--
2.52.0
>From 3752f2d0fac7b3212ee2d1278c501ba5e8433f9d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 25 Jan 2026 21:04:21 +0100
Subject: [PATCH 3/6] avcodec/x86/cfhdencdsp: Don't load twice
Sign-extend the integer arguments directly from the stack
instead of first loading them as qwords and then sign-extending
the lower half.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/cfhdencdsp.asm | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
index 73e12f283e..83676cea81 100644
--- a/libavcodec/x86/cfhdencdsp.asm
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -35,7 +35,8 @@ SECTION .text
%if ARCH_X86_64
INIT_XMM sse2
-cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp
+cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp
+ movsxdifnidn widthq, widthm
shl istrideq, 1
shl lwidthq, 1
shl hwidthq, 1
@@ -43,8 +44,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwid
mova m8, [pw_1]
pcmpeqw m9, m9 ; -1
mova m10,[pw_p1_n1]
- movsxdifnidn yq, yd
- movsxdifnidn widthq, widthd
+ movsxdifnidn yq, ym
neg yq
.looph:
movsx xq, word [inputq]
--
2.52.0
>From 4dc24d619dec7088a5f28267deb771d19bd066b6 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 25 Jan 2026 21:29:33 +0100
Subject: [PATCH 4/6] avcodec/x86/cfhdencdsp: Avoid += x, -= x
Avoid incrementing lowq and highq inside the loop by using
complex addressing modes, which removes the need to undo said
modification at the end of the horizontal loop.
For inputq, modify istrideq outside of the loop so that
it is only modified once at the end of the horizontal loop.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/cfhdencdsp.asm | 14 ++++----------
1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
index 83676cea81..f2da8720b4 100644
--- a/libavcodec/x86/cfhdencdsp.asm
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -37,12 +37,13 @@ SECTION .text
INIT_XMM sse2
cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp
movsxdifnidn widthq, widthm
- shl istrideq, 1
shl lwidthq, 1
shl hwidthq, 1
mova m7, [pd_4]
mova m8, [pw_1]
pcmpeqw m9, m9 ; -1
+ sub istrideq, widthq
+ shl istrideq, 1
mova m10,[pw_p1_n1]
movsxdifnidn yq, ym
neg yq
@@ -136,8 +137,6 @@ cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, hwid
cmp xq, widthq
jl .loopw
- add lowq, widthq
- add highq, widthq
lea inputq, [inputq + widthq * 2]
movsx xq, word [inputq - 4]
@@ -147,7 +146,7 @@ cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, hwid
movd xm0, tempd
packssdw m0, m0
movd tempd, m0
- mov word [lowq-2], tempw
+ mov word [lowq+widthq-2], tempw
movsx tempq, word [inputq - 4]
imul tempq, 11
@@ -175,12 +174,7 @@ cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, hwid
movd xm0, tempd
packssdw m0, m0
movd tempd, m0
- mov word [highq-2], tempw
-
- sub inputq, widthq
- sub inputq, widthq
- sub highq, widthq
- sub lowq, widthq
+ mov word [highq+widthq-2], tempw
add lowq, lwidthq
add highq, hwidthq
--
2.52.0
>From 79ef1a5651f3f4937974140717068b68dc6d51bb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 26 Jan 2026 03:00:39 +0100
Subject: [PATCH 5/6] avcodec/x86/cfhddsp: Avoid pmaddwd
The result of using pmaddwd with the coefficients 1,-1,...,1,-1
is just the negative of using pmaddwd with the coefficients
-1,1,...,-1,1, so avoid one pmaddwd.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/cfhddsp.asm | 43 +++++++++++++-------------------------
1 file changed, 14 insertions(+), 29 deletions(-)
diff --git a/libavcodec/x86/cfhddsp.asm b/libavcodec/x86/cfhddsp.asm
index 87c2df634a..821d511ba2 100644
--- a/libavcodec/x86/cfhddsp.asm
+++ b/libavcodec/x86/cfhddsp.asm
@@ -24,7 +24,6 @@
SECTION_RODATA
factor_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1,
-factor_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1,
factor_p11_n4: dw 11, -4, 11, -4, 11, -4, 11, -4,
factor_p5_p4: dw 5, 4, 5, 4, 5, 4, 5, 4,
pd_4: times 4 dd 4
@@ -80,7 +79,6 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
%if ARCH_X86_64
mova m8, [factor_p1_n1]
- mova m9, [factor_n1_p1]
mova m10, [pw_1]
mova m11, [pd_4]
%endif
@@ -144,29 +142,23 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
punpcklwd m4, m1
punpckhwd m5, m1
- mova m6, m4
- mova m7, m5
-
%if ARCH_X86_64
pmaddwd m4, m8
pmaddwd m5, m8
- pmaddwd m6, m9
- pmaddwd m7, m9
+ psubd m6, m11, m4
+ psubd m7, m11, m5
paddd m4, m11
paddd m5, m11
- paddd m6, m11
- paddd m7, m11
%else
+ mova m2, [pd_4]
pmaddwd m4, [factor_p1_n1]
pmaddwd m5, [factor_p1_n1]
- pmaddwd m6, [factor_n1_p1]
- pmaddwd m7, [factor_n1_p1]
- paddd m4, [pd_4]
- paddd m5, [pd_4]
- paddd m6, [pd_4]
- paddd m7, [pd_4]
+ psubd m6, m2, m4
+ psubd m7, m2, m5
+ paddd m4, m2
+ paddd m5, m2
%endif
psrad m4, 3
@@ -313,7 +305,6 @@ cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth,
dec heightd
mova m8, [factor_p1_n1]
- mova m9, [factor_n1_p1]
mova m10, [pw_1]
mova m11, [pd_4]
mova m12, [factor_p11_n4]
@@ -471,29 +462,23 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
punpcklwd m4, m1
punpckhwd m5, m1
- mova m6, m4
- mova m7, m5
-
%if ARCH_X86_64
pmaddwd m4, m8
pmaddwd m5, m8
- pmaddwd m6, m9
- pmaddwd m7, m9
+ psubd m6, m11, m4
+ psubd m7, m11, m5
paddd m4, m11
paddd m5, m11
- paddd m6, m11
- paddd m7, m11
%else
+ mova m2, [pd_4]
pmaddwd m4, [factor_p1_n1]
pmaddwd m5, [factor_p1_n1]
- pmaddwd m6, [factor_n1_p1]
- pmaddwd m7, [factor_n1_p1]
- paddd m4, [pd_4]
- paddd m5, [pd_4]
- paddd m6, [pd_4]
- paddd m7, [pd_4]
+ psubd m6, m2, m4
+ psubd m7, m2, m5
+ paddd m4, m2
+ paddd m5, m2
%endif
psrad m4, 3
--
2.52.0
>From 506f57782f9595e56fe5f32de495b884fdd9ffab Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 26 Jan 2026 03:21:48 +0100
Subject: [PATCH 6/6] avcodec/x86/cfhddsp: Reduce number of xmm registers used
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/cfhddsp.asm | 68 +++++++++++++++++++-------------------
1 file changed, 34 insertions(+), 34 deletions(-)
diff --git a/libavcodec/x86/cfhddsp.asm b/libavcodec/x86/cfhddsp.asm
index 821d511ba2..01ba00f8a6 100644
--- a/libavcodec/x86/cfhddsp.asm
+++ b/libavcodec/x86/cfhddsp.asm
@@ -36,20 +36,20 @@ SECTION .text
%macro CFHD_HORIZ_FILTER 1
%if %1 == 1023
-cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
+cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 3 * ARCH_X86_64, output, low, high, width, x, temp
shl widthd, 1
%define ostrideq widthq
%define lwidthq widthq
%define hwidthq widthq
%elif %1 == 4095
-cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
+cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 3 * ARCH_X86_64, output, low, high, width, x, temp
shl widthd, 1
%define ostrideq widthq
%define lwidthq widthq
%define hwidthq widthq
%else
%if ARCH_X86_64
-cglobal cfhd_horiz_filter, 8, 11, 12, output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
+cglobal cfhd_horiz_filter, 8, 11, 11, output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
shl ostrided, 1
shl lwidthd, 1
shl hwidthd, 1
@@ -79,8 +79,8 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
%if ARCH_X86_64
mova m8, [factor_p1_n1]
- mova m10, [pw_1]
- mova m11, [pd_4]
+ mova m9, [pw_1]
+ mova m10, [pd_4]
%endif
%if %1 == 0
@@ -146,10 +146,10 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
pmaddwd m4, m8
pmaddwd m5, m8
- psubd m6, m11, m4
- psubd m7, m11, m5
- paddd m4, m11
- paddd m5, m11
+ psubd m6, m10, m4
+ psubd m7, m10, m5
+ paddd m4, m10
+ paddd m5, m10
%else
mova m2, [pd_4]
pmaddwd m4, [factor_p1_n1]
@@ -177,8 +177,8 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
mova m3, m0
%if ARCH_X86_64
- pmaddwd m2, m10
- pmaddwd m0, m10
+ pmaddwd m2, m9
+ pmaddwd m0, m9
pmaddwd m1, m8
pmaddwd m3, m8
%else
@@ -296,7 +296,7 @@ CFHD_HORIZ_FILTER 4095
INIT_XMM sse2
%if ARCH_X86_64
-cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
+cglobal cfhd_vert_filter, 8, 11, 13, output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
shl ostrided, 1
shl lwidthd, 1
shl hwidthd, 1
@@ -305,10 +305,10 @@ cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth,
dec heightd
mova m8, [factor_p1_n1]
- mova m10, [pw_1]
- mova m11, [pd_4]
- mova m12, [factor_p11_n4]
- mova m13, [factor_p5_p4]
+ mova m9, [pw_1]
+ mova m10, [pd_4]
+ mova m11, [factor_p11_n4]
+ mova m12, [factor_p5_p4]
%else
cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
shl xd, 1
@@ -344,8 +344,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
punpckhwd m2, m1
%if ARCH_X86_64
- pmaddwd m0, m12
- pmaddwd m2, m12
+ pmaddwd m0, m11
+ pmaddwd m2, m11
%else
pmaddwd m0, [factor_p11_n4]
pmaddwd m2, [factor_p11_n4]
@@ -398,8 +398,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
punpckhwd m2, m1
%if ARCH_X86_64
- pmaddwd m0, m13
- pmaddwd m2, m13
+ pmaddwd m0, m12
+ pmaddwd m2, m12
%else
pmaddwd m0, [factor_p5_p4]
pmaddwd m2, [factor_p5_p4]
@@ -466,10 +466,10 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
pmaddwd m4, m8
pmaddwd m5, m8
- psubd m6, m11, m4
- psubd m7, m11, m5
- paddd m4, m11
- paddd m5, m11
+ psubd m6, m10, m4
+ psubd m7, m10, m5
+ paddd m4, m10
+ paddd m5, m10
%else
mova m2, [pd_4]
pmaddwd m4, [factor_p1_n1]
@@ -502,8 +502,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
mova m3, m2
%if ARCH_X86_64
- pmaddwd m0, m10
- pmaddwd m2, m10
+ pmaddwd m0, m9
+ pmaddwd m2, m9
pmaddwd m1, m8
pmaddwd m3, m8
%else
@@ -550,8 +550,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
punpckhwd m2, m1
%if ARCH_X86_64
- pmaddwd m0, m13
- pmaddwd m2, m13
+ pmaddwd m0, m12
+ pmaddwd m2, m12
%else
pmaddwd m0, [factor_p5_p4]
pmaddwd m2, [factor_p5_p4]
@@ -571,8 +571,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
psubd m2, m3
%if ARCH_X86_64
- paddd m0, m11
- paddd m2, m11
+ paddd m0, m10
+ paddd m2, m10
%else
paddd m0, [pd_4]
paddd m2, [pd_4]
@@ -618,8 +618,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
punpckhwd m2, m1
%if ARCH_X86_64
- pmaddwd m0, m12
- pmaddwd m2, m12
+ pmaddwd m0, m11
+ pmaddwd m2, m11
%else
pmaddwd m0, [factor_p11_n4]
pmaddwd m2, [factor_p11_n4]
@@ -639,8 +639,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
paddd m2, m3
%if ARCH_X86_64
- paddd m0, m11
- paddd m2, m11
+ paddd m0, m10
+ paddd m2, m10
%else
paddd m0, [pd_4]
paddd m2, [pd_4]
--
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2026-01-26 2:40 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=176939515513.25.11347051055088177752@4457048688e7 \
--to=ffmpeg-devel@ffmpeg.org \
--cc=code@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git