* [FFmpeg-devel] [PATCH] avcodec/x86/lossless_videodsp: Remove SSSE3 functions using MMX regs (PR #21236)
@ 2025-12-19 6:30 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-12-19 6:30 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #21236 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21236
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21236.patch
>From 294893f6f81248d0f744f013488c5e49d483a97c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 18 Dec 2025 21:32:10 +0100
Subject: [PATCH 1/6] avcodec/x86/lossless_videodsp: Remove SSSE3 functions
using MMX regs
These functions are only used on Conroe (they are overridden
by the SSSE3 functions using xmm registers unless the SSSE3SLOW flag is set),
which is very old (introduced in 2006), so remove them.
Btw: The checkasm test (which uses declare_func and not
declare_func_emms since cd8a33bcce0a36874a851558aacd2e4b22dc6e00)
would fail on a Conroe, yet no one ever reported any such failure.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/lossless_videodsp.asm | 28 +------------------------
libavcodec/x86/lossless_videodsp_init.c | 12 ++---------
2 files changed, 3 insertions(+), 37 deletions(-)
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 7159aafe67..359d1ee4ca 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -27,9 +27,8 @@ SECTION_RODATA
cextern pb_15
pb_zzzzzzzz77777777: times 8 db -1
-pb_7: times 8 db 7
+ times 8 db 7
pb_ef: times 8 db 14,15
-pb_67: times 8 db 6, 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
@@ -119,10 +118,8 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
paddb m1, m2
pshufb m2, m1, m4
paddb m1, m2
-%if mmsize >= 16
pshufb m2, m1, m6
paddb m1, m2
-%endif
paddb xm0, xm1
%if %1
mova [dstq+wq], xm0
@@ -160,16 +157,6 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
;------------------------------------------------------------------------------
; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
;------------------------------------------------------------------------------
-INIT_MMX ssse3
-cglobal add_left_pred, 3,3,7, dst, src, w, left
-.skip_prologue:
- mova m5, [pb_7]
- mova m4, [pb_zzzz3333zzzzbbbb]
- mova m3, [pb_zz11zz55zz99zzdd]
- movd m0, leftm
- psllq m0, 56
- ADD_LEFT_LOOP 1, 1
-
%macro ADD_LEFT_PRED_UNALIGNED 0
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
mova xm5, [pb_15]
@@ -255,11 +242,9 @@ ADD_BYTES
pshufb m1, m3
paddw m1, m2
pshufb m0, m5
-%if mmsize == 16
mova m2, m1
pshufb m1, m4
paddw m1, m2
-%endif
paddw m0, m1
pand m0, m7
%ifidn %1, a
@@ -284,17 +269,6 @@ ADD_BYTES
;---------------------------------------------------------------------------------------------
; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
;---------------------------------------------------------------------------------------------
-INIT_MMX ssse3
-cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
-.skip_prologue:
- mova m5, [pb_67]
- mova m3, [pb_zzzz2323zzzzabab]
- movd m0, leftm
- psllq m0, 48
- movd m7, maskm
- SPLATW m7 ,m7
- ADD_HFYU_LEFT_LOOP_INT16 a, a
-
INIT_XMM ssse3
cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
mova m5, [pb_ef]
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index 5690cacaad..fce3dd4d62 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -29,14 +29,11 @@ void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, ptrdiff_t w,
int *left, int *left_top);
-int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
- ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
-int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
int ff_add_left_pred_int16_unaligned_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
@@ -52,14 +49,9 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
}
if (EXTERNAL_SSSE3(cpu_flags)) {
- c->add_left_pred = ff_add_left_pred_ssse3;
- c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
- c->add_gradient_pred = ff_add_gradient_pred_ssse3;
- }
-
- if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
- c->add_left_pred = ff_add_left_pred_unaligned_ssse3;
+ c->add_left_pred = ff_add_left_pred_unaligned_ssse3;
c->add_left_pred_int16 = ff_add_left_pred_int16_unaligned_ssse3;
+ c->add_gradient_pred = ff_add_gradient_pred_ssse3;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
--
2.49.1
>From 214af879eb007b2f5833febd6f7bd607ef7d062b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 18 Dec 2025 21:45:46 +0100
Subject: [PATCH 2/6] tests/checkasm/llviddsp: Avoid unnecessary
initializations, allocs
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
tests/checkasm/llviddsp.c | 32 ++++++++++++++------------------
1 file changed, 14 insertions(+), 18 deletions(-)
diff --git a/tests/checkasm/llviddsp.c b/tests/checkasm/llviddsp.c
index 0552e98106..c7180ba698 100644
--- a/tests/checkasm/llviddsp.c
+++ b/tests/checkasm/llviddsp.c
@@ -56,13 +56,11 @@ static void check_add_bytes(LLVidDSPContext *c, int width)
fail();
- if (check_func(c->add_bytes, "add_bytes")) {
call_ref(dst0, src0, width);
call_new(dst1, src1, width);
if (memcmp(dst0, dst1, width))
fail();
bench_new(dst1, src1, width);
- }
av_free(src0);
av_free(src1);
@@ -91,13 +89,11 @@ static void check_add_median_pred(LLVidDSPContext *c, int width) {
b1 = b0;
- if (check_func(c->add_median_pred, "add_median_pred")) {
call_ref(dst0, src0, diff0, width, &a0, &b0);
call_new(dst1, src1, diff1, width, &a1, &b1);
if (memcmp(dst0, dst1, width) || (a0 != a1) || (b0 != b1))
fail();
bench_new(dst1, src1, diff1, width, &a1, &b1);
- }
av_free(src0);
av_free(src1);
@@ -107,7 +103,7 @@ static void check_add_median_pred(LLVidDSPContext *c, int width) {
av_free(dst1);
}
-static void check_add_left_pred(LLVidDSPContext *c, int width, int acc, const char * report)
+static void check_add_left_pred(LLVidDSPContext *c, int width, int acc)
{
int res0, res1;
uint8_t *dst0 = av_mallocz(width);
@@ -121,14 +117,12 @@ static void check_add_left_pred(LLVidDSPContext *c, int width, int acc, const ch
if (!dst0 || !dst1)
fail();
- if (check_func(c->add_left_pred, "%s", report)) {
res0 = call_ref(dst0, src0, width, acc);
res1 = call_new(dst1, src1, width, acc);
if ((res0 & 0xFF) != (res1 & 0xFF)||\
memcmp(dst0, dst1, width))
fail();
bench_new(dst1, src1, width, acc);
- }
av_free(src0);
av_free(src1);
@@ -136,7 +130,7 @@ static void check_add_left_pred(LLVidDSPContext *c, int width, int acc, const ch
av_free(dst1);
}
-static void check_add_left_pred_16(LLVidDSPContext *c, unsigned mask, int width, unsigned acc, const char * report)
+static void check_add_left_pred_16(LLVidDSPContext *c, unsigned mask, int width, unsigned acc)
{
int res0, res1;
uint16_t *dst0 = av_calloc(width, sizeof(*dst0));
@@ -150,14 +144,12 @@ static void check_add_left_pred_16(LLVidDSPContext *c, unsigned mask, int width,
if (!dst0 || !dst1)
fail();
- if (check_func(c->add_left_pred_int16, "%s", report)) {
res0 = call_ref(dst0, src0, mask, width, acc);
res1 = call_new(dst1, src1, mask, width, acc);
if ((res0 &0xFFFF) != (res1 &0xFFFF)||\
memcmp(dst0, dst1, width))
fail();
bench_new(dst1, src1, mask, width, acc);
- }
av_free(src0);
av_free(src1);
@@ -178,7 +170,6 @@ static void check_add_gradient_pred(LLVidDSPContext *c, int w) {
init_buffer(src0, src1, uint8_t, src_size);
- if (check_func(c->add_gradient_pred, "add_gradient_pred")) {
call_ref(src0 + stride + 32, stride, w);
call_new(src1 + stride + 32, stride, w);
if (memcmp(src0, src1, stride)||/* previous line doesn't change */
@@ -186,7 +177,6 @@ static void check_add_gradient_pred(LLVidDSPContext *c, int w) {
fail();
}
bench_new(src1 + stride + 32, stride, w);
- }
av_free(src0);
av_free(src1);
@@ -204,21 +194,27 @@ void checkasm_check_llviddsp(void)
ff_llviddsp_init(&c);
- check_add_bytes(&c, width);
+ if (check_func(c.add_bytes, "add_bytes"))
+ check_add_bytes(&c, width);
report("add_bytes");
- check_add_median_pred(&c, width);
+ if (check_func(c.add_median_pred, "add_median_pred"))
+ check_add_median_pred(&c, width);
report("add_median_pred");
- check_add_left_pred(&c, width, 0, "add_left_pred_zero");
+ if (check_func(c.add_left_pred, "add_left_pred_zero"))
+ check_add_left_pred(&c, width, 0);
report("add_left_pred_zero");
- check_add_left_pred(&c, width, accRnd, "add_left_pred_rnd_acc");
+ if (check_func(c.add_left_pred, "add_left_pred_rnd_acc"))
+ check_add_left_pred(&c, width, accRnd);
report("add_left_pred_rnd_acc");
- check_add_left_pred_16(&c, 255, width, accRnd, "add_left_pred_int16");
+ if (check_func(c.add_left_pred_int16, "add_left_pred_int16"))
+ check_add_left_pred_16(&c, 255, width, accRnd);
report("add_left_pred_int16");
- check_add_gradient_pred(&c, width);
+ if (check_func(c.add_gradient_pred, "add_gradient_pred"))
+ check_add_gradient_pred(&c, width);
report("add_gradient_pred");
}
--
2.49.1
>From 40c70223f8ae07c83cf31525978a798cb59c8dc7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 18 Dec 2025 21:48:54 +0100
Subject: [PATCH 3/6] tests/checkasm/llviddsp: Reindent after the previous
commit
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
tests/checkasm/llviddsp.c | 56 +++++++++++++++++++--------------------
1 file changed, 27 insertions(+), 29 deletions(-)
diff --git a/tests/checkasm/llviddsp.c b/tests/checkasm/llviddsp.c
index c7180ba698..a8245b0d94 100644
--- a/tests/checkasm/llviddsp.c
+++ b/tests/checkasm/llviddsp.c
@@ -56,11 +56,11 @@ static void check_add_bytes(LLVidDSPContext *c, int width)
fail();
- call_ref(dst0, src0, width);
- call_new(dst1, src1, width);
- if (memcmp(dst0, dst1, width))
- fail();
- bench_new(dst1, src1, width);
+ call_ref(dst0, src0, width);
+ call_new(dst1, src1, width);
+ if (memcmp(dst0, dst1, width))
+ fail();
+ bench_new(dst1, src1, width);
av_free(src0);
av_free(src1);
@@ -89,11 +89,11 @@ static void check_add_median_pred(LLVidDSPContext *c, int width) {
b1 = b0;
- call_ref(dst0, src0, diff0, width, &a0, &b0);
- call_new(dst1, src1, diff1, width, &a1, &b1);
- if (memcmp(dst0, dst1, width) || (a0 != a1) || (b0 != b1))
- fail();
- bench_new(dst1, src1, diff1, width, &a1, &b1);
+ call_ref(dst0, src0, diff0, width, &a0, &b0);
+ call_new(dst1, src1, diff1, width, &a1, &b1);
+ if (memcmp(dst0, dst1, width) || (a0 != a1) || (b0 != b1))
+ fail();
+ bench_new(dst1, src1, diff1, width, &a1, &b1);
av_free(src0);
av_free(src1);
@@ -117,12 +117,11 @@ static void check_add_left_pred(LLVidDSPContext *c, int width, int acc)
if (!dst0 || !dst1)
fail();
- res0 = call_ref(dst0, src0, width, acc);
- res1 = call_new(dst1, src1, width, acc);
- if ((res0 & 0xFF) != (res1 & 0xFF)||\
- memcmp(dst0, dst1, width))
- fail();
- bench_new(dst1, src1, width, acc);
+ res0 = call_ref(dst0, src0, width, acc);
+ res1 = call_new(dst1, src1, width, acc);
+ if ((res0 & 0xFF) != (res1 & 0xFF) || memcmp(dst0, dst1, width))
+ fail();
+ bench_new(dst1, src1, width, acc);
av_free(src0);
av_free(src1);
@@ -144,12 +143,11 @@ static void check_add_left_pred_16(LLVidDSPContext *c, unsigned mask, int width,
if (!dst0 || !dst1)
fail();
- res0 = call_ref(dst0, src0, mask, width, acc);
- res1 = call_new(dst1, src1, mask, width, acc);
- if ((res0 &0xFFFF) != (res1 &0xFFFF)||\
- memcmp(dst0, dst1, width))
- fail();
- bench_new(dst1, src1, mask, width, acc);
+ res0 = call_ref(dst0, src0, mask, width, acc);
+ res1 = call_new(dst1, src1, mask, width, acc);
+ if ((res0 &0xFFFF) != (res1 &0xFFFF)|| memcmp(dst0, dst1, width))
+ fail();
+ bench_new(dst1, src1, mask, width, acc);
av_free(src0);
av_free(src1);
@@ -170,13 +168,13 @@ static void check_add_gradient_pred(LLVidDSPContext *c, int w) {
init_buffer(src0, src1, uint8_t, src_size);
- call_ref(src0 + stride + 32, stride, w);
- call_new(src1 + stride + 32, stride, w);
- if (memcmp(src0, src1, stride)||/* previous line doesn't change */
- memcmp(src0+stride, src1 + stride, w + 32)) {
- fail();
- }
- bench_new(src1 + stride + 32, stride, w);
+ call_ref(src0 + stride + 32, stride, w);
+ call_new(src1 + stride + 32, stride, w);
+ if (memcmp(src0, src1, stride)||/* previous line doesn't change */
+ memcmp(src0+stride, src1 + stride, w + 32)) {
+ fail();
+ }
+ bench_new(src1 + stride + 32, stride, w);
av_free(src0);
av_free(src1);
--
2.49.1
>From c978bb36ce8f50c7cfd73add26bddb9947cfe6d6 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 18 Dec 2025 21:59:33 +0100
Subject: [PATCH 4/6] avcodec/x86/lossless_videodsp: Don't store in eight byte
chunks
Use movu (movdqu) instead of movq+movhps.
Old benchmarks:
add_left_pred_int16_c: 2265.5 ( 1.00x)
add_left_pred_int16_ssse3: 595.4 ( 3.81x)
add_left_pred_rnd_acc_c: 1255.0 ( 1.00x)
add_left_pred_rnd_acc_ssse3: 326.2 ( 3.85x)
add_left_pred_rnd_acc_avx2: 279.0 ( 4.50x)
add_left_pred_zero_c: 1249.5 ( 1.00x)
add_left_pred_zero_ssse3: 326.1 ( 3.83x)
add_left_pred_zero_avx2: 277.0 ( 4.51x)
New benchmarks:
add_left_pred_int16_c: 2266.9 ( 1.00x)
add_left_pred_int16_ssse3: 509.9 ( 4.45x)
add_left_pred_rnd_acc_c: 1251.4 ( 1.00x)
add_left_pred_rnd_acc_ssse3: 282.6 ( 4.43x)
add_left_pred_rnd_acc_avx2: 208.9 ( 5.99x)
add_left_pred_zero_c: 1253.7 ( 1.00x)
add_left_pred_zero_ssse3: 280.0 ( 4.48x)
add_left_pred_zero_avx2: 206.8 ( 6.06x)
The checkasm test has been modified to use an unaligned destination
for this test.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/lossless_videodsp.asm | 35 +++++++---------------------
1 file changed, 8 insertions(+), 27 deletions(-)
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 359d1ee4ca..7dd10228fc 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -101,17 +101,13 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
RET
-%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+%macro ADD_LEFT_LOOP 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
add srcq, wq
add dstq, wq
neg wq
%%.loop:
pshufb xm0, xm5
-%if %2
- mova m1, [srcq+wq]
-%else
- movu m1, [srcq+wq]
-%endif
+ mov%2 m1, [srcq+wq]
psllw m2, m1, 8
paddb m1, m2
pshufb m2, m1, m3
@@ -121,24 +117,14 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
pshufb m2, m1, m6
paddb m1, m2
paddb xm0, xm1
-%if %1
- mova [dstq+wq], xm0
-%else
- movq [dstq+wq], xm0
- movhps [dstq+wq+8], xm0
-%endif
+ mov%1 [dstq+wq], xm0
%if mmsize == 32
vextracti128 xm2, m1, 1 ; get second lane of the ymm
pshufb xm0, xm5 ; set alls val to last val of the first lane
paddb xm0, xm2
;store val
-%if %1
- mova [dstq+wq+16], xm0
-%else;
- movq [dstq+wq+16], xm0
- movhps [dstq+wq+16+8], xm0
-%endif
+ mov%1 [dstq+wq+16], xm0
%endif
add wq, mmsize
jl %%.loop
@@ -169,11 +155,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
jnz .src_unaligned
test dstq, mmsize - 1
jnz .dst_unaligned
- ADD_LEFT_LOOP 1, 1
+ ADD_LEFT_LOOP a, a
.dst_unaligned:
- ADD_LEFT_LOOP 0, 1
+ ADD_LEFT_LOOP u, a
.src_unaligned:
- ADD_LEFT_LOOP 0, 0
+ ADD_LEFT_LOOP u, u
%endmacro
INIT_XMM ssse3
@@ -247,12 +233,7 @@ ADD_BYTES
paddw m1, m2
paddw m0, m1
pand m0, m7
-%ifidn %1, a
- mova [dstq+wq], m0
-%else
- movq [dstq+wq], m0
- movhps [dstq+wq+8], m0
-%endif
+ mov%1 [dstq+wq], m0
add wq, mmsize
jl %%.loop
mov eax, mmsize-1
--
2.49.1
>From b6d23065f375846863fb2abce5a4aac543441210 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 18 Dec 2025 23:15:12 +0100
Subject: [PATCH 5/6] avcodec/x86/lossless_videodsp: Avoid aligned/unaligned
versions
For AVX2, movdqu is as fast as movdqa when used on aligned addresses,
so don't instantiate aligned/unaligned versions.
(The check was, by the way, overly strict: the AVX2 code only uses 16-byte
stores, so it would be enough for dst to be 16-byte aligned.)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/lossless_videodsp.asm | 2 ++
1 file changed, 2 insertions(+)
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 7dd10228fc..462155656a 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -151,6 +151,7 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
movd xm0, leftm
pslldq xm0, 15
+%if notcpuflag(avx2)
test srcq, mmsize - 1
jnz .src_unaligned
test dstq, mmsize - 1
@@ -159,6 +160,7 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
.dst_unaligned:
ADD_LEFT_LOOP u, a
.src_unaligned:
+%endif
ADD_LEFT_LOOP u, u
%endmacro
--
2.49.1
>From 36039d378bfb040516470b78a08eda4ec1cfc44a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 18 Dec 2025 23:47:06 +0100
Subject: [PATCH 6/6] avcodec/x86/lossless_videodsp: Avoid unnecessary reg
push,pop
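Happens on Win64, where xmm6 and above are callee-saved: using m6 instead
of m7 lowers the number of xmm registers that have to be saved and restored.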
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/lossless_videodsp.asm | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 462155656a..1761a2f08f 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -234,7 +234,7 @@ ADD_BYTES
pshufb m1, m4
paddw m1, m2
paddw m0, m1
- pand m0, m7
+ pand m0, m6
mov%1 [dstq+wq], m0
add wq, mmsize
jl %%.loop
@@ -253,14 +253,14 @@ ADD_BYTES
; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
;---------------------------------------------------------------------------------------------
INIT_XMM ssse3
-cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
+cglobal add_left_pred_int16_unaligned, 4,4,7, dst, src, mask, w, left
mova m5, [pb_ef]
mova m4, [pb_zzzzzzzz67676767]
mova m3, [pb_zzzz2323zzzzabab]
movd m0, leftm
+ movd m6, maskm
pslldq m0, 14
- movd m7, maskm
- SPLATW m7 ,m7
+ SPLATW m6, m6
test srcq, 15
jnz .src_unaligned
test dstq, 15
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-12-19 6:31 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-12-19 6:30 [FFmpeg-devel] [PATCH] avcodec/x86/lossless_videodsp: Remove SSSE3 functions using MMX regs (PR #21236) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git