* [FFmpeg-devel] [PATCH] Avoid MMX in VP8 (PR #21081)
@ 2025-12-02 18:54 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-12-02 18:54 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #21081 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21081
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21081.patch
Also remove some unused functions. For the RISCV stuff (the penultimate commit) only compilation was tested.
>From e495162f74195c6ef6060a2d8034f0a715425b2d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 23 Nov 2025 11:08:14 +0100
Subject: [PATCH 01/15] avcodec/x86/vp8dsp: Remove MMXEXT functions overridden
by SSSE3
SSSE3 is already quite old (introduced in 2006 for Intel, 2011 for AMD),
so the overwhelming majority of our users (particularly those
that actually update their FFmpeg) will be using the SSSE3 versions.
This commit therefore removes the MMX(EXT) functions overridden
by them (which don't abide by the ABI, as they clobber the floating
point state without issuing emms) to get closer to a removal of emms_c.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 159 +----------------------------------
libavcodec/x86/vp8dsp_init.c | 37 +-------
2 files changed, 6 insertions(+), 190 deletions(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 231c21ea0d..7b836351e4 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1,5 +1,5 @@
;******************************************************************************
-;* VP8 MMXEXT optimizations
+;* VP8 ASM optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
@@ -24,25 +24,6 @@
SECTION_RODATA
-fourtap_filter_hw_m: times 4 dw -6, 123
- times 4 dw 12, -1
- times 4 dw -9, 93
- times 4 dw 50, -6
- times 4 dw -6, 50
- times 4 dw 93, -9
- times 4 dw -1, 12
- times 4 dw 123, -6
-
-sixtap_filter_hw_m: times 4 dw 2, -11
- times 4 dw 108, 36
- times 4 dw -8, 1
- times 4 dw 3, -16
- times 4 dw 77, 77
- times 4 dw -16, 3
- times 4 dw 1, -8
- times 4 dw 36, 108
- times 4 dw -11, 2
-
fourtap_filter_hb_m: times 8 db -6, 123
times 8 db 12, -1
times 8 db -9, 93
@@ -115,8 +96,6 @@ bilinear_filter_vb_m: times 8 db 7, 1
times 8 db 1, 7
%if PIC
-%define fourtap_filter_hw picregq
-%define sixtap_filter_hw picregq
%define fourtap_filter_hb picregq
%define sixtap_filter_hb picregq
%define fourtap_filter_v picregq
@@ -125,8 +104,6 @@ bilinear_filter_vb_m: times 8 db 7, 1
%define bilinear_filter_vb picregq
%define npicregs 1
%else
-%define fourtap_filter_hw fourtap_filter_hw_m
-%define sixtap_filter_hw sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb sixtap_filter_hb_m
%define fourtap_filter_v fourtap_filter_v_m
@@ -322,112 +299,6 @@ FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8
-; 4x4 block, H-only 4-tap filter
-INIT_MMX mmxext
-cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
- shl mxd, 4
-%if PIC
- lea picregq, [fourtap_filter_hw_m]
-%endif
- movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
- movq mm5, [fourtap_filter_hw+mxq]
- movq mm7, [pw_64]
- pxor mm6, mm6
-
-.nextrow:
- movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
-
- ; first set of 2 pixels
- movq mm2, mm1 ; byte ABCD..
- punpcklbw mm1, mm6 ; byte->word ABCD
- pshufw mm0, mm2, 9 ; byte CDEF..
- punpcklbw mm0, mm6 ; byte->word CDEF
- pshufw mm3, mm1, 0x94 ; word ABBC
- pshufw mm1, mm0, 0x94 ; word CDDE
- pmaddwd mm3, mm4 ; multiply 2px with F0/F1
- movq mm0, mm1 ; backup for second set of pixels
- pmaddwd mm1, mm5 ; multiply 2px with F2/F3
- paddd mm3, mm1 ; finish 1st 2px
-
- ; second set of 2 pixels, use backup of above
- punpckhbw mm2, mm6 ; byte->word EFGH
- pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
- pshufw mm1, mm2, 0x94 ; word EFFG
- pmaddwd mm1, mm5 ; multiply 2px with F2/F3
- paddd mm0, mm1 ; finish 2nd 2px
-
- ; merge two sets of 2 pixels into one set of 4, round/clip/store
- packssdw mm3, mm0 ; merge dword->word (4px)
- paddsw mm3, mm7 ; rounding
- psraw mm3, 7
- packuswb mm3, mm6 ; clip and word->bytes
- movd [dstq], mm3 ; store
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- RET
-
-; 4x4 block, H-only 6-tap filter
-INIT_MMX mmxext
-cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
- lea mxd, [mxq*3]
-%if PIC
- lea picregq, [sixtap_filter_hw_m]
-%endif
- movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words
- movq mm5, [sixtap_filter_hw+mxq*8-32]
- movq mm6, [sixtap_filter_hw+mxq*8-16]
- movq mm7, [pw_64]
- pxor mm3, mm3
-
-.nextrow:
- movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
-
- ; first set of 2 pixels
- movq mm2, mm1 ; byte ABCD..
- punpcklbw mm1, mm3 ; byte->word ABCD
- pshufw mm0, mm2, 0x9 ; byte CDEF..
- punpckhbw mm2, mm3 ; byte->word EFGH
- punpcklbw mm0, mm3 ; byte->word CDEF
- pshufw mm1, mm1, 0x94 ; word ABBC
- pshufw mm2, mm2, 0x94 ; word EFFG
- pmaddwd mm1, mm4 ; multiply 2px with F0/F1
- pshufw mm3, mm0, 0x94 ; word CDDE
- movq mm0, mm3 ; backup for second set of pixels
- pmaddwd mm3, mm5 ; multiply 2px with F2/F3
- paddd mm1, mm3 ; add to 1st 2px cache
- movq mm3, mm2 ; backup for second set of pixels
- pmaddwd mm2, mm6 ; multiply 2px with F4/F5
- paddd mm1, mm2 ; finish 1st 2px
-
- ; second set of 2 pixels, use backup of above
- movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
- pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
- pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
- paddd mm0, mm3 ; add to 2nd 2px cache
- pxor mm3, mm3
- punpcklbw mm2, mm3 ; byte->word FGHI
- pshufw mm2, mm2, 0xE9 ; word GHHI
- pmaddwd mm2, mm6 ; multiply 2px with F4/F5
- paddd mm0, mm2 ; finish 2nd 2px
-
- ; merge two sets of 2 pixels into one set of 4, round/clip/store
- packssdw mm1, mm0 ; merge dword->word (4px)
- paddsw mm1, mm7 ; rounding
- psraw mm1, 7
- packuswb mm1, mm3 ; clip and word->bytes
- movd [dstq], mm1 ; store
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- RET
-
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 5
@@ -539,9 +410,9 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h
jg .nextrow
RET
-%macro FILTER_V 1
+INIT_XMM sse2
; 4x4 block, V-only 4-tap filter
-cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 5
%if PIC
lea picregq, [fourtap_filter_v_m]
@@ -594,7 +465,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
; 4x4 block, V-only 6-tap filter
-cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
lea myq, [myq*3]
%if PIC
@@ -656,12 +527,6 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
dec heightd ; next row
jg .nextrow
RET
-%endmacro
-
-INIT_MMX mmxext
-FILTER_V 4
-INIT_XMM sse2
-FILTER_V 8
%macro FILTER_BILINEAR 1
%if cpuflag(ssse3)
@@ -722,16 +587,9 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
-%if mmsize == 8
- packuswb m0, m0
- packuswb m2, m2
- movh [dstq+dststrideq*0], m0
- movh [dstq+dststrideq*1], m2
-%else
packuswb m0, m2
movh [dstq+dststrideq*0], m0
movhps [dstq+dststrideq*1], m0
-%endif
%endif ; cpuflag(ssse3)
lea dstq, [dstq+dststrideq*2]
@@ -799,16 +657,9 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
-%if mmsize == 8
- packuswb m0, m0
- packuswb m2, m2
- movh [dstq+dststrideq*0], m0
- movh [dstq+dststrideq*1], m2
-%else
packuswb m0, m2
movh [dstq+dststrideq*0], m0
movhps [dstq+dststrideq*1], m0
-%endif
%endif ; cpuflag(ssse3)
lea dstq, [dstq+dststrideq*2]
@@ -818,8 +669,6 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
RET
%endmacro
-INIT_MMX mmxext
-FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
INIT_MMX ssse3
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index e37afab775..00733a2564 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -29,19 +29,6 @@
/*
* MC functions
*/
-void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
- const uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
- const uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
- const uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
- const uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-
void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
@@ -80,9 +67,6 @@ void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
-void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
- const uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
@@ -93,9 +77,6 @@ void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
-void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
- const uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
@@ -159,14 +140,6 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
dst, dststride, tmpptr, SIZE, height, mx, my); \
}
-#define HVTAPMMX(x, y) \
-HVTAP(mmxext, 8, x, y, 4, 8)
-
-HVTAPMMX(4, 4)
-HVTAPMMX(4, 6)
-HVTAPMMX(6, 4)
-HVTAPMMX(6, 6)
-
#define HVTAPSSE2(x, y, w) \
HVTAP(sse2, 16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)
@@ -194,7 +167,6 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \
}
-HVBILIN(mmxext, 8, 4, 8)
HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16)
HVBILIN(ssse3, 8, 4, 8)
@@ -285,13 +257,6 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
}
- /* note that 4-tap width=16 functions are missing because w=16
- * is only used for luma, and luma is always a copy or sixtap. */
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- VP8_MC_FUNC(2, 4, mmxext);
- VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
- }
-
if (EXTERNAL_SSE(cpu_flags)) {
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
@@ -304,6 +269,8 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
}
+ /* note that 4-tap width=16 functions are missing because w=16
+ * is only used for luma, and luma is always a copy or sixtap. */
if (EXTERNAL_SSSE3(cpu_flags)) {
VP8_LUMA_MC_FUNC(0, 16, ssse3);
VP8_MC_FUNC(1, 8, ssse3);
--
2.49.1
>From 3fd1685e3d4cbde7f8754c91911e70ea780ce52b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 23 Nov 2025 11:25:26 +0100
Subject: [PATCH 02/15] avcodec/x86/vp8dsp: Don't use MMX registers in
put_vp8_pixels8
Use GPRs on x86-64 and xmm registers otherwise (using GPRs reduces code
size). This avoids clobbering the floating point state and therefore no
longer breaks the ABI.
No change in benchmarks here.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 20 ++++++++++++++------
libavcodec/x86/vp8dsp_init.c | 9 +++------
2 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 7b836351e4..7dee979e20 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -676,14 +676,22 @@ FILTER_BILINEAR 4
INIT_XMM ssse3
FILTER_BILINEAR 8
-INIT_MMX mmx
-cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
+INIT_XMM sse2
+cglobal put_vp8_pixels8, 5, 5+2*ARCH_X86_64, 2, dst, dststride, src, srcstride, height
.nextrow:
- movq mm0, [srcq+srcstrideq*0]
- movq mm1, [srcq+srcstrideq*1]
+%if ARCH_X86_64
+ mov r5q, [srcq+srcstrideq*0]
+ mov r6q, [srcq+srcstrideq*1]
lea srcq, [srcq+srcstrideq*2]
- movq [dstq+dststrideq*0], mm0
- movq [dstq+dststrideq*1], mm1
+ mov [dstq+dststrideq*0], r5q
+ mov [dstq+dststrideq*1], r6q
+%else
+ movq m0, [srcq+srcstrideq*0]
+ movq m1, [srcq+srcstrideq*1]
+ lea srcq, [srcq+srcstrideq*2]
+ movq [dstq+dststrideq*0], m0
+ movq [dstq+dststrideq*1], m1
+%endif
lea dstq, [dstq+dststrideq*2]
sub heightd, 2
jg .nextrow
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index 00733a2564..40aa52c7f0 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -88,7 +88,7 @@ void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
int height, int mx, int my);
-void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
+void ff_put_vp8_pixels8_sse2(uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
@@ -252,17 +252,14 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
- if (EXTERNAL_MMX(cpu_flags)) {
- c->put_vp8_epel_pixels_tab[1][0][0] =
- c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
- }
-
if (EXTERNAL_SSE(cpu_flags)) {
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
+ c->put_vp8_epel_pixels_tab[1][0][0] =
+ c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_sse2;
VP8_LUMA_MC_FUNC(0, 16, sse2);
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
--
2.49.1
>From a08ac2daa09f50bfe9ff84aec746a9b4c7b80a36 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 23 Nov 2025 12:53:12 +0100
Subject: [PATCH 03/15] avcodec/x86/vp8dsp: Directly use negated stride
There is a register available. No change in benchmarks here.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 44 +++++++++++++++++++--------------------
1 file changed, 21 insertions(+), 23 deletions(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 7dee979e20..6b5ca9f309 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -219,11 +219,11 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
mova m7, [pw_256]
; read 3 lines
- sub srcq, srcstrideq
- movh m0, [srcq]
- movh m1, [srcq+ srcstrideq]
- movh m2, [srcq+2*srcstrideq]
- add srcq, srcstrideq
+ mov picregq, srcstrideq
+ neg picregq
+ movh m0, [srcq+picregq]
+ movh m1, [srcq]
+ movh m2, [srcq+srcstrideq]
.nextrow:
movh m3, [srcq+2*srcstrideq] ; read new row
@@ -255,18 +255,17 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
lea myq, [sixtap_filter_hb+myq*8]
; read 5 lines
- sub srcq, srcstrideq
- sub srcq, srcstrideq
- movh m0, [srcq]
- movh m1, [srcq+srcstrideq]
- movh m2, [srcq+srcstrideq*2]
+ mov picregq, srcstrideq
+ neg picregq
+ movh m0, [srcq+2*picregq]
+ movh m1, [srcq+picregq]
+ movh m2, [srcq]
+ movh m3, [srcq+srcstrideq]
+ movh m4, [srcq+2*srcstrideq]
lea srcq, [srcq+srcstrideq*2]
- add srcq, srcstrideq
- movh m3, [srcq]
- movh m4, [srcq+srcstrideq]
.nextrow:
- movh m5, [srcq+2*srcstrideq] ; read new row
+ movh m5, [srcq+srcstrideq] ; read new row
mova m6, m0
punpcklbw m6, m5
mova m0, m1
@@ -475,15 +474,14 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre
pxor m7, m7
; read 5 lines
- sub srcq, srcstrideq
- sub srcq, srcstrideq
- movh m0, [srcq]
- movh m1, [srcq+srcstrideq]
- movh m2, [srcq+srcstrideq*2]
+ mov picregq, srcstrideq
+ neg picregq
+ movh m0, [srcq+2*picregq]
+ movh m1, [srcq+picregq]
+ movh m2, [srcq]
+ movh m3, [srcq+srcstrideq]
+ movh m4, [srcq+2*srcstrideq]
lea srcq, [srcq+srcstrideq*2]
- add srcq, srcstrideq
- movh m3, [srcq]
- movh m4, [srcq+srcstrideq]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
@@ -499,7 +497,7 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre
paddsw m6, m5
; then calculate positive taps
- movh m5, [srcq+2*srcstrideq] ; read new row
+ movh m5, [srcq+srcstrideq] ; read new row
punpcklbw m5, m7
pmullw m0, [myq+0]
paddsw m6, m0
--
2.49.1
>From 456ecec84197e6be99b1811fb0eda5722df47da9 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 23 Nov 2025 13:15:07 +0100
Subject: [PATCH 04/15] avcodec/x86/vp8dsp: Increment src pointer earlier
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 6b5ca9f309..0d37012e9d 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -166,6 +166,7 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
pmaddubsw m0, m5
pmaddubsw m1, m6
pmaddubsw m2, m7
+ add srcq, srcstrideq
paddsw m0, m1
paddsw m0, m2
pmulhrsw m0, [pw_256]
@@ -174,7 +175,6 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
; go to next line
add dstq, dststrideq
- add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
RET
@@ -197,6 +197,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m6
+ add srcq, srcstrideq
paddsw m0, m1
pmulhrsw m0, m2
packuswb m0, m0
@@ -204,7 +205,6 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
; go to next line
add dstq, dststrideq
- add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
RET
@@ -234,6 +234,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
punpcklbw m2, m3
pmaddubsw m4, m5
pmaddubsw m2, m6
+ add srcq, srcstrideq
paddsw m4, m2
mova m2, m3
pmulhrsw m4, m7
@@ -242,7 +243,6 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
; go to next line
add dstq, dststrideq
- add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
RET
@@ -275,6 +275,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
pmaddubsw m6, [myq-48]
pmaddubsw m1, [myq-32]
pmaddubsw m7, [myq-16]
+ add srcq, srcstrideq
paddsw m6, m1
paddsw m6, m7
mova m1, m2
@@ -287,7 +288,6 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
; go to next line
add dstq, dststrideq
- add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
RET
@@ -331,6 +331,7 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h
pmullw m2, [mxq+32]
pmullw m3, [mxq+48]
%endif
+ add srcq, srcstrideq
paddsw m0, m1
paddsw m2, m3
paddsw m0, m2
@@ -341,7 +342,6 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h
; go to next line
add dstq, dststrideq
- add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
RET
@@ -392,6 +392,7 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h
pmullw m4, [mxq+64]
pmullw m5, [mxq+80]
%endif
+ add srcq, srcstrideq
paddsw m1, m4
paddsw m0, m5
paddsw m1, m2
@@ -404,7 +405,6 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h
; go to next line
add dstq, dststrideq
- add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
RET
@@ -446,6 +446,7 @@ cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picre
paddsw m4, m1
mova m1, m2
pmullw m2, [myq+32]
+ add srcq, srcstrideq
paddsw m4, m2
mova m2, m3
@@ -457,7 +458,6 @@ cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picre
; go to next line
add dstq, dststrideq
- add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
RET
@@ -507,6 +507,7 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre
paddsw m6, m2
mova m2, m3
pmullw m3, [myq+48]
+ add srcq, srcstrideq
paddsw m6, m3
mova m3, m4
mova m4, m5
@@ -521,7 +522,6 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre
; go to next line
add dstq, dststrideq
- add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
RET
@@ -543,6 +543,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p
punpcklbw m1, m2
pmaddubsw m0, m3
pmaddubsw m1, m3
+ lea srcq, [srcq+srcstrideq*2]
psraw m0, 2
psraw m1, 2
pavgw m0, m4
@@ -579,6 +580,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
pmullw m1, m5
pmullw m2, m4
pmullw m3, m5
+ lea srcq, [srcq+srcstrideq*2]
paddsw m0, m1
paddsw m2, m3
psraw m0, 2
@@ -591,7 +593,6 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
%endif ; cpuflag(ssse3)
lea dstq, [dstq+dststrideq*2]
- lea srcq, [srcq+srcstrideq*2]
sub heightd, 2
jg .nextrow
RET
@@ -612,6 +613,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride
pshufb m1, m2
pmaddubsw m0, m3
pmaddubsw m1, m3
+ lea srcq, [srcq+srcstrideq*2]
psraw m0, 2
psraw m1, 2
pavgw m0, m4
@@ -649,6 +651,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
pmullw m1, m5
pmullw m2, m4
pmullw m3, m5
+ lea srcq, [srcq+srcstrideq*2]
paddsw m0, m1
paddsw m2, m3
psraw m0, 2
@@ -661,7 +664,6 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
%endif ; cpuflag(ssse3)
lea dstq, [dstq+dststrideq*2]
- lea srcq, [srcq+srcstrideq*2]
sub heightd, 2
jg .nextrow
RET
--
2.49.1
>From 936f8412aff35236d0f2c786aafa40d75331a640 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 23 Nov 2025 13:27:35 +0100
Subject: [PATCH 05/15] avcodec/x86/vp8dsp: Avoid reload
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 0d37012e9d..e971da68ac 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -535,8 +535,8 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p
%endif
pxor m4, m4
mova m3, [bilinear_filter_vb+myq-16]
-.nextrow:
movh m0, [srcq+srcstrideq*0]
+.nextrow:
movh m1, [srcq+srcstrideq*1]
movh m2, [srcq+srcstrideq*2]
punpcklbw m0, m1
@@ -558,6 +558,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p
movh [dstq+dststrideq*0], m0
movhps [dstq+dststrideq*1], m0
%endif
+ mova m0, m2
%else ; cpuflag(ssse3)
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
--
2.49.1
>From 15d229859aa0d7804791f70100fd55738925560a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 23 Nov 2025 15:39:48 +0100
Subject: [PATCH 06/15] avcodec/x86/vp8dsp_init: Remove unused macro
Forgotten in 6a551f14050674fb685920eb1b0640810cacccf9.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp_init.c | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index 40aa52c7f0..828b038cdf 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -105,16 +105,6 @@ static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
-#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
-static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
- uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
- ptrdiff_t srcstride, int height, int mx, int my) \
-{ \
- ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
- dst, dststride, src, srcstride, height, mx, my); \
- ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
- dst + 4, dststride, src + 4, srcstride, height, mx, my); \
-}
TAP_W16(sse2, epel, h6)
TAP_W16(sse2, epel, v6)
--
2.49.1
>From 1b99c21a689f61a8dbac5dfd7ec4dc46b3ffd698 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 23 Nov 2025 20:25:26 +0100
Subject: [PATCH 07/15] avcodec/x86/vp8dsp: Avoid unpacking multiple times
Always pair row i with row i+2 for the vertical four-tap filter
and with row i+3 for the vertical six-tap filter (instead of pairing
the first with the sixth, the second with the third and the fourth
with the fifth). This allows each row to be unpacked only once instead
of (at most) three times.
Old benchmarks:
vp8_put_epel4_v4_c: 98.4 ( 1.00x)
vp8_put_epel4_v4_ssse3: 28.6 ( 3.44x)
vp8_put_epel4_v6_c: 131.6 ( 1.00x)
vp8_put_epel4_v6_ssse3: 38.5 ( 3.42x)
vp8_put_epel8_v4_c: 362.5 ( 1.00x)
vp8_put_epel8_v4_sse2: 63.8 ( 5.68x)
vp8_put_epel8_v4_ssse3: 44.4 ( 8.16x)
vp8_put_epel8_v6_c: 538.3 ( 1.00x)
vp8_put_epel8_v6_sse2: 86.5 ( 6.22x)
vp8_put_epel8_v6_ssse3: 57.0 ( 9.44x)
vp8_put_epel16_v6_c: 1044.6 ( 1.00x)
vp8_put_epel16_v6_sse2: 158.0 ( 6.61x)
vp8_put_epel16_v6_ssse3: 106.7 ( 9.79x)
New benchmarks:
vp8_put_epel4_v4_c: 100.0 ( 1.00x)
vp8_put_epel4_v4_ssse3: 28.4 ( 3.52x)
vp8_put_epel4_v6_c: 131.7 ( 1.00x)
vp8_put_epel4_v6_ssse3: 34.3 ( 3.84x)
vp8_put_epel8_v4_c: 364.4 ( 1.00x)
vp8_put_epel8_v4_sse2: 63.7 ( 5.72x)
vp8_put_epel8_v4_ssse3: 43.3 ( 8.42x)
vp8_put_epel8_v6_c: 550.2 ( 1.00x)
vp8_put_epel8_v6_sse2: 86.4 ( 6.37x)
vp8_put_epel8_v6_ssse3: 52.9 (10.40x)
vp8_put_epel16_v6_c: 1052.5 ( 1.00x)
vp8_put_epel16_v6_sse2: 158.3 ( 6.65x)
vp8_put_epel16_v6_ssse3: 98.9 (10.64x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 68 +++++++++++++++++++++++++--------------
1 file changed, 44 insertions(+), 24 deletions(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index e971da68ac..7cb729a443 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -33,6 +33,15 @@ fourtap_filter_hb_m: times 8 db -6, 123
times 8 db -1, 12
times 8 db 123, -6
+fourtap_filter_b_m: times 8 db -6, 12
+ times 8 db 123, -1
+ times 8 db -9, 50
+ times 8 db 93, -6
+ times 8 db -6, 93
+ times 8 db 50, -9
+ times 8 db -1, 123
+ times 8 db 12, -6
+
sixtap_filter_hb_m: times 8 db 2, 1
times 8 db -11, 108
times 8 db 36, -8
@@ -43,6 +52,16 @@ sixtap_filter_hb_m: times 8 db 2, 1
times 8 db -8, 36
times 8 db 108, -11
+sixtap_filter_b_m: times 8 db 2, 36
+ times 8 db -11, -8
+ times 8 db 108, 1
+ times 8 db 3, 77
+ times 8 db -16, -16
+ times 8 db 77, 3
+ times 8 db 1, 108
+ times 8 db -8, -11
+ times 8 db 36, 2
+
fourtap_filter_v_m: times 8 dw -6
times 8 dw 123
times 8 dw 12
@@ -97,7 +116,9 @@ bilinear_filter_vb_m: times 8 db 7, 1
%if PIC
%define fourtap_filter_hb picregq
+%define fourtap_filter_b picregq
%define sixtap_filter_hb picregq
+%define sixtap_filter_b picregq
%define fourtap_filter_v picregq
%define sixtap_filter_v picregq
%define bilinear_filter_vw picregq
@@ -105,7 +126,9 @@ bilinear_filter_vb_m: times 8 db 7, 1
%define npicregs 1
%else
%define fourtap_filter_hb fourtap_filter_hb_m
+%define fourtap_filter_b fourtap_filter_b_m
%define sixtap_filter_hb sixtap_filter_hb_m
+%define sixtap_filter_b sixtap_filter_b_m
%define fourtap_filter_v fourtap_filter_v_m
%define sixtap_filter_v sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
@@ -212,10 +235,10 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
%if PIC
- lea picregq, [fourtap_filter_hb_m]
+ lea picregq, [fourtap_filter_b_m]
%endif
- mova m5, [fourtap_filter_hb+myq-16]
- mova m6, [fourtap_filter_hb+myq]
+ mova m5, [fourtap_filter_b+myq-16]
+ mova m6, [fourtap_filter_b+myq]
mova m7, [pw_256]
; read 3 lines
@@ -224,21 +247,20 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
movh m0, [srcq+picregq]
movh m1, [srcq]
movh m2, [srcq+srcstrideq]
+ punpcklbw m0, m2
.nextrow:
movh m3, [srcq+2*srcstrideq] ; read new row
- mova m4, m0
+ pmaddubsw m0, m5
+ punpcklbw m1, m3
+ pmaddubsw m4, m1, m6
+ add srcq, srcstrideq
+ paddsw m4, m0
mova m0, m1
- punpcklbw m4, m1
- mova m1, m2
- punpcklbw m2, m3
- pmaddubsw m4, m5
- pmaddubsw m2, m6
- add srcq, srcstrideq
- paddsw m4, m2
- mova m2, m3
pmulhrsw m4, m7
+ mova m1, m2
packuswb m4, m4
+ mova m2, m3
movh [dstq], m4
; go to next line
@@ -250,9 +272,9 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
lea myd, [myq*3]
%if PIC
- lea picregq, [sixtap_filter_hb_m]
+ lea picregq, [sixtap_filter_b_m]
%endif
- lea myq, [sixtap_filter_hb+myq*8]
+ lea myq, [sixtap_filter_b+myq*8]
; read 5 lines
mov picregq, srcstrideq
@@ -263,20 +285,18 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
movh m3, [srcq+srcstrideq]
movh m4, [srcq+2*srcstrideq]
lea srcq, [srcq+srcstrideq*2]
+ punpcklbw m0, m3
+ punpcklbw m1, m4
.nextrow:
movh m5, [srcq+srcstrideq] ; read new row
- mova m6, m0
- punpcklbw m6, m5
+ pmaddubsw m0, [myq-48]
+ punpcklbw m2, m5
+ pmaddubsw m6, m1, [myq-32]
+ pmaddubsw m7, m2, [myq-16]
+ add srcq, srcstrideq
+ paddw m6, m0
mova m0, m1
- punpcklbw m1, m2
- mova m7, m3
- punpcklbw m7, m4
- pmaddubsw m6, [myq-48]
- pmaddubsw m1, [myq-32]
- pmaddubsw m7, [myq-16]
- add srcq, srcstrideq
- paddsw m6, m1
paddsw m6, m7
mova m1, m2
mova m2, m3
--
2.49.1
>From 25836faa4f89001299f9faa75f00a2bc8d55d0ea Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 23 Nov 2025 23:29:24 +0100
Subject: [PATCH 08/15] avcodec/x86/vp8dsp: Don't use MMX registers in
ff_put_vp8_epel4_v6_ssse3
Switching to xmm registers allows processing two rows in parallel,
leading to speedups. It is also ABI compliant (no more missing emms).
Old benchmarks:
vp8_put_epel4_v6_c: 132.8 ( 1.00x)
vp8_put_epel4_v6_ssse3: 34.3 ( 3.87x)
New benchmarks:
vp8_put_epel4_v6_c: 131.5 ( 1.00x)
vp8_put_epel4_v6_ssse3: 27.1 ( 4.86x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 48 +++++++++++++++++++++++++++++++++++----
1 file changed, 43 insertions(+), 5 deletions(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 7cb729a443..4778944ac7 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -162,6 +162,12 @@ SECTION .text
;-------------------------------------------------------------------------------
%macro FILTER_SSSE3 1
+%if %1 == 4
+%define MOV movd
+%else
+%define MOV movq
+%endif
+
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
lea mxd, [mxq*3]
mova m3, [filter_h6_shuf2]
@@ -269,6 +275,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
jg .nextrow
RET
+INIT_XMM ssse3
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
lea myd, [myq*3]
%if PIC
@@ -279,14 +286,44 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
; read 5 lines
mov picregq, srcstrideq
neg picregq
- movh m0, [srcq+2*picregq]
- movh m1, [srcq+picregq]
- movh m2, [srcq]
- movh m3, [srcq+srcstrideq]
- movh m4, [srcq+2*srcstrideq]
+ MOV m0, [srcq+2*picregq]
+ MOV m1, [srcq+picregq]
+ MOV m2, [srcq]
+ MOV m3, [srcq+srcstrideq]
+ MOV m4, [srcq+2*srcstrideq]
lea srcq, [srcq+srcstrideq*2]
punpcklbw m0, m3
punpcklbw m1, m4
+%if %1 == 4
+ punpcklqdq m0, m1
+
+.next2rows:
+ movd m5, [srcq+srcstrideq]
+ movd m6, [srcq+2*srcstrideq]
+ pmaddubsw m0, [myq-48]
+ punpcklbw m2, m5
+ punpcklqdq m1, m2
+ pmaddubsw m1, [myq-32]
+ punpcklbw m3, m6
+ punpcklqdq m2, m3
+ paddw m0, m1
+ pmaddubsw m1, m2, [myq-16]
+ lea srcq, [srcq+2*srcstrideq]
+ paddsw m1, m0
+ mova m0, m2
+ pmulhrsw m1, [pw_256]
+ mova m2, m4
+ packuswb m1, m1
+ movd [dstq], m1
+ mova m4, m6
+ psrldq m1, 4
+ movd [dstq+dststrideq], m1
+ lea dstq, [dstq+2*dststrideq]
+ mova m1, m3
+ mova m3, m5
+ sub heightd, 2
+ jg .next2rows
+%else
.nextrow:
movh m5, [srcq+srcstrideq] ; read new row
@@ -310,6 +347,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
add dstq, dststrideq
dec heightd ; next row
jg .nextrow
+%endif
RET
%endmacro
--
2.49.1
>From 9755f51400a2668ddb05d92932badf26bb0c9723 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 24 Nov 2025 09:16:26 +0100
Subject: [PATCH 09/15] avcodec/x86/vp8dsp: Don't use MMX registers in
ff_put_vp8_epel4_v4_ssse3
Switching to xmm registers allows processing two rows in parallel,
leading to speedups. It is also ABI-compliant (no more missing emms).
Old benchmarks:
vp8_put_epel4_v4_c: 96.8 ( 1.00x)
vp8_put_epel4_v4_ssse3: 28.2 ( 3.43x)
New benchmarks:
vp8_put_epel4_v4_c: 95.1 ( 1.00x)
vp8_put_epel4_v4_ssse3: 22.8 ( 4.17x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 36 +++++++++++++++++++++++++++++++-----
1 file changed, 31 insertions(+), 5 deletions(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 4778944ac7..fd60feaf1f 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -238,6 +238,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
jg .nextrow
RET
+INIT_XMM ssse3
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
%if PIC
@@ -250,13 +251,38 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
; read 3 lines
mov picregq, srcstrideq
neg picregq
- movh m0, [srcq+picregq]
- movh m1, [srcq]
- movh m2, [srcq+srcstrideq]
+ MOV m0, [srcq+picregq]
+ MOV m1, [srcq]
+ MOV m2, [srcq+srcstrideq]
+ lea srcq, [srcq+2*srcstrideq]
punpcklbw m0, m2
+%if %1 == 4
+.next2rows:
+ movd m3, [srcq]
+ movd m4, [srcq+srcstrideq]
+ punpcklbw m1, m3
+ punpcklqdq m0, m1
+ punpcklbw m2, m4
+ pmaddubsw m0, m5
+ punpcklqdq m1, m2
+ pmaddubsw m1, m6
+ lea srcq, [srcq+2*srcstrideq]
+ paddsw m1, m0
+ pmulhrsw m1, m7
+ mova m0, m2
+ packuswb m1, m1
+ movd [dstq], m1
+ mova m2, m4
+ psrldq m1, 4
+ movd [dstq+dststrideq], m1
+ mova m1, m3
+ lea dstq, [dstq+2*dststrideq]
+ sub heightd, 2
+ jg .next2rows
+%else
.nextrow:
- movh m3, [srcq+2*srcstrideq] ; read new row
+ movh m3, [srcq] ; read new row
pmaddubsw m0, m5
punpcklbw m1, m3
pmaddubsw m4, m1, m6
@@ -273,9 +299,9 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
add dstq, dststrideq
dec heightd ; next row
jg .nextrow
+%endif
RET
-INIT_XMM ssse3
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
lea myd, [myq*3]
%if PIC
--
2.49.1
>From 131c522c30fdcc8259ac120372b83253f1ab6906 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 24 Nov 2025 13:29:42 +0100
Subject: [PATCH 10/15] avcodec/x86/vp8dsp: Don't use MMX registers in
ff_put_vp8_epel4_h4_ssse3
Doubling the register width allows using only one pshufb and one pmaddubsw.
Old benchmarks:
vp8_put_epel4_h4_c: 82.8 ( 1.00x)
vp8_put_epel4_h4_ssse3: 13.9 ( 5.96x)
New benchmarks:
vp8_put_epel4_h4_c: 82.7 ( 1.00x)
vp8_put_epel4_h4_ssse3: 11.7 ( 7.08x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 36 ++++++++++++++++++++++++++++++++----
1 file changed, 32 insertions(+), 4 deletions(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index fd60feaf1f..6c365898ce 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -24,6 +24,15 @@
SECTION_RODATA
+fourtap_filter4_b_m: times 4 db -6, 123
+ times 4 db 12, -1
+ times 4 db -9, 93
+ times 4 db 50, -6
+ times 4 db -6, 50
+ times 4 db 93, -9
+ times 4 db -1, 12
+ times 4 db 123, -6
+
fourtap_filter_hb_m: times 8 db -6, 123
times 8 db 12, -1
times 8 db -9, 93
@@ -117,6 +126,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
%if PIC
%define fourtap_filter_hb picregq
%define fourtap_filter_b picregq
+%define fourtap_filter4_b picregq
%define sixtap_filter_hb picregq
%define sixtap_filter_b picregq
%define fourtap_filter_v picregq
@@ -127,6 +137,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
%else
%define fourtap_filter_hb fourtap_filter_hb_m
%define fourtap_filter_b fourtap_filter_b_m
+%define fourtap_filter4_b fourtap_filter4_b_m
%define sixtap_filter_hb sixtap_filter_hb_m
%define sixtap_filter_b sixtap_filter_b_m
%define fourtap_filter_v fourtap_filter_v_m
@@ -136,6 +147,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
%define npicregs 0
%endif
+filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
@@ -208,9 +220,11 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
jg .nextrow
RET
-cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
- shl mxd, 4
+INIT_XMM ssse3
+cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), dst, dststride, src, srcstride, height, mx, picreg
mova m2, [pw_256]
+%if %1 == 8
+ shl mxd, 4
mova m3, [filter_h2_shuf]
mova m4, [filter_h4_shuf]
%if PIC
@@ -218,19 +232,34 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
%endif
mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
mova m6, [fourtap_filter_hb+mxq]
+%else
+ shl mxd, 3
+ mova m3, [filter4_h4_shuf]
+%if PIC
+ lea picregq, [fourtap_filter4_b_m]
+%endif
+ mova m5, [fourtap_filter4_b+mxq-8]
+%endif
.nextrow:
+%if %1 == 4
+ movq m0, [srcq-1]
+ pshufb m0, m3
+ pmaddubsw m0, m5
+ movhlps m1, m0
+%else
movu m0, [srcq-1]
mova m1, m0
pshufb m0, m3
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m6
+%endif
add srcq, srcstrideq
paddsw m0, m1
pmulhrsw m0, m2
packuswb m0, m0
- movh [dstq], m0 ; store
+ MOV [dstq], m0 ; store
; go to next line
add dstq, dststrideq
@@ -238,7 +267,6 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
jg .nextrow
RET
-INIT_XMM ssse3
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
%if PIC
--
2.49.1
>From 31ed005d0407e55469bf13d1344469eb1f1af456 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 24 Nov 2025 16:11:10 +0100
Subject: [PATCH 11/15] avcodec/x86/vp8dsp: Don't use MMX registers in
ff_put_vp8_epel4_h6_ssse3
Doubling the register width makes it possible to avoid a pshufb and a pmaddubsw.
Old benchmarks:
vp8_put_epel4_h6_c: 115.9 ( 1.00x)
vp8_put_epel4_h6_ssse3: 20.2 ( 5.74x)
vp8_put_epel4_h6v4_c: 276.3 ( 1.00x)
vp8_put_epel4_h6v4_ssse3: 58.6 ( 4.71x)
vp8_put_epel4_h6v6_c: 363.6 ( 1.00x)
vp8_put_epel4_h6v6_ssse3: 62.5 ( 5.82x)
New benchmarks:
vp8_put_epel4_h6_c: 116.4 ( 1.00x)
vp8_put_epel4_h6_ssse3: 16.0 ( 7.29x)
vp8_put_epel4_h6v4_c: 280.9 ( 1.00x)
vp8_put_epel4_h6v4_ssse3: 44.3 ( 6.33x)
vp8_put_epel4_h6v6_c: 365.6 ( 1.00x)
vp8_put_epel4_h6v6_ssse3: 53.1 ( 6.89x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 50 +++++++++++++++++++++++++++++----------
1 file changed, 38 insertions(+), 12 deletions(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 6c365898ce..2a66e51da6 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -33,6 +33,16 @@ fourtap_filter4_b_m: times 4 db -6, 123
times 4 db -1, 12
times 4 db 123, -6
+sixtap_filter4_hb_m: times 8 db 2, -11
+ times 4 db 108, -8
+ times 4 db 36, 1
+ times 8 db 3, -16
+ times 4 db 77, -16
+ times 4 db 77, 3
+ times 8 db 1, -8
+ times 4 db 36, -11
+ times 4 db 108, 2
+
fourtap_filter_hb_m: times 8 db -6, 123
times 8 db 12, -1
times 8 db -9, 93
@@ -129,6 +139,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
%define fourtap_filter4_b picregq
%define sixtap_filter_hb picregq
%define sixtap_filter_b picregq
+%define sixtap_filter4_hb picregq
%define fourtap_filter_v picregq
%define sixtap_filter_v picregq
%define bilinear_filter_vw picregq
@@ -140,6 +151,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
%define fourtap_filter4_b fourtap_filter4_b_m
%define sixtap_filter_hb sixtap_filter_hb_m
%define sixtap_filter_b sixtap_filter_b_m
+%define sixtap_filter4_hb sixtap_filter4_hb_m
%define fourtap_filter_v fourtap_filter_v_m
%define sixtap_filter_v sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
@@ -148,6 +160,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
%endif
filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
+filter4_h6_shuf: db 1, 3, 2, 4, 3, 5, 4, 6, 2, 4, 3, 5, 4, 6, 5, 7
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
@@ -180,7 +193,16 @@ SECTION .text
%define MOV movq
%endif
-cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
+cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 6+2*(%1==8), dst, dststride, src, srcstride, height, mx, picreg
+%if %1 == 4
+ mova m3, [filter4_h6_shuf]
+%if PIC
+ lea picregq, [sixtap_filter4_hb_m]
+%endif
+ shl mxd, 4
+ mova m4, [sixtap_filter4_hb+mxq-32]
+ mova m5, [sixtap_filter4_hb+mxq-16]
+%else
lea mxd, [mxq*3]
mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3]
@@ -190,29 +212,35 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
mova m6, [sixtap_filter_hb+mxq*8-32]
mova m7, [sixtap_filter_hb+mxq*8-16]
+%endif
.nextrow:
+%if %1 == 4
+ ; we need nine bytes, so two loads
+ movq m1, [srcq-1]
+ movq m0, [srcq-2]
+ punpcklbw m0, m1
+ pshufb m1, m3
+ pmaddubsw m1, m5
+ pmaddubsw m0, m4
+ movhlps m2, m1
+%else
movu m0, [srcq-2]
mova m1, m0
mova m2, m0
-%if mmsize == 8
-; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
-; shuffle with a memory operand
- punpcklbw m0, [srcq+3]
-%else
pshufb m0, [filter_h6_shuf1]
-%endif
pshufb m1, m3
pshufb m2, m4
pmaddubsw m0, m5
pmaddubsw m1, m6
pmaddubsw m2, m7
+%endif
add srcq, srcstrideq
- paddsw m0, m1
+ paddw m0, m1
paddsw m0, m2
pmulhrsw m0, [pw_256]
packuswb m0, m0
- movh [dstq], m0 ; store
+ MOV [dstq], m0 ; store
; go to next line
add dstq, dststrideq
@@ -220,7 +248,6 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
jg .nextrow
RET
-INIT_XMM ssse3
cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), dst, dststride, src, srcstride, height, mx, picreg
mova m2, [pw_256]
%if %1 == 8
@@ -405,9 +432,8 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
RET
%endmacro
-INIT_MMX ssse3
-FILTER_SSSE3 4
INIT_XMM ssse3
+FILTER_SSSE3 4
FILTER_SSSE3 8
INIT_XMM sse2
--
2.49.1
>From a406799d622e86a73853ab9be9ca77f6367c3d9c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 24 Nov 2025 20:32:58 +0100
Subject: [PATCH 12/15] avcodec/x86/vp8dsp: Reduce number of coefficient tables
By changing the permutations used in the epel8_h{4,6} case
we can simply reuse the coefficient tables from the vertical epel
filters.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 54 ++++++++++++---------------------------
1 file changed, 17 insertions(+), 37 deletions(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 2a66e51da6..340f6cc818 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -43,15 +43,6 @@ sixtap_filter4_hb_m: times 8 db 2, -11
times 4 db 36, -11
times 4 db 108, 2
-fourtap_filter_hb_m: times 8 db -6, 123
- times 8 db 12, -1
- times 8 db -9, 93
- times 8 db 50, -6
- times 8 db -6, 50
- times 8 db 93, -9
- times 8 db -1, 12
- times 8 db 123, -6
-
fourtap_filter_b_m: times 8 db -6, 12
times 8 db 123, -1
times 8 db -9, 50
@@ -61,16 +52,6 @@ fourtap_filter_b_m: times 8 db -6, 12
times 8 db -1, 123
times 8 db 12, -6
-sixtap_filter_hb_m: times 8 db 2, 1
- times 8 db -11, 108
- times 8 db 36, -8
- times 8 db 3, 3
- times 8 db -16, 77
- times 8 db 77, -16
- times 8 db 1, 2
- times 8 db -8, 36
- times 8 db 108, -11
-
sixtap_filter_b_m: times 8 db 2, 36
times 8 db -11, -8
times 8 db 108, 1
@@ -134,10 +115,8 @@ bilinear_filter_vb_m: times 8 db 7, 1
times 8 db 1, 7
%if PIC
-%define fourtap_filter_hb picregq
%define fourtap_filter_b picregq
%define fourtap_filter4_b picregq
-%define sixtap_filter_hb picregq
%define sixtap_filter_b picregq
%define sixtap_filter4_hb picregq
%define fourtap_filter_v picregq
@@ -146,10 +125,8 @@ bilinear_filter_vb_m: times 8 db 7, 1
%define bilinear_filter_vb picregq
%define npicregs 1
%else
-%define fourtap_filter_hb fourtap_filter_hb_m
%define fourtap_filter_b fourtap_filter_b_m
%define fourtap_filter4_b fourtap_filter4_b_m
-%define sixtap_filter_hb sixtap_filter_hb_m
%define sixtap_filter_b sixtap_filter_b_m
%define sixtap_filter4_hb sixtap_filter4_hb_m
%define fourtap_filter_v fourtap_filter_v_m
@@ -161,12 +138,15 @@ bilinear_filter_vb_m: times 8 db 7, 1
filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
filter4_h6_shuf: db 1, 3, 2, 4, 3, 5, 4, 6, 2, 4, 3, 5, 4, 6, 5, 7
-filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
-filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
-filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
+filter_h4_shuf1: db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9
+filter_h4_shuf2: db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+filter_h6_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
+filter_h6_shuf2: db 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11
+filter_h6_shuf3: db 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 9, 12
+
+filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734
@@ -207,11 +187,11 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 6+2*(%1==8), dst, dststride, src, sr
mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3]
%if PIC
- lea picregq, [sixtap_filter_hb_m]
+ lea picregq, [sixtap_filter_b_m]
%endif
- mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
- mova m6, [sixtap_filter_hb+mxq*8-32]
- mova m7, [sixtap_filter_hb+mxq*8-16]
+ mova m5, [sixtap_filter_b+mxq*8-48] ; set up 6tap filter in bytes
+ mova m6, [sixtap_filter_b+mxq*8-32]
+ mova m7, [sixtap_filter_b+mxq*8-16]
%endif
.nextrow:
@@ -252,13 +232,13 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), dst, dststride, src,
mova m2, [pw_256]
%if %1 == 8
shl mxd, 4
- mova m3, [filter_h2_shuf]
- mova m4, [filter_h4_shuf]
+ mova m3, [filter_h4_shuf1]
+ mova m4, [filter_h4_shuf2]
%if PIC
- lea picregq, [fourtap_filter_hb_m]
+ lea picregq, [fourtap_filter_b_m]
%endif
- mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
- mova m6, [fourtap_filter_hb+mxq]
+ mova m5, [fourtap_filter_b+mxq-16] ; set up 4tap filter in bytes
+ mova m6, [fourtap_filter_b+mxq]
%else
shl mxd, 3
mova m3, [filter4_h4_shuf]
--
2.49.1
>From 61379497c16bfd4048882f93461ee5d094431e1a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 24 Nov 2025 22:36:45 +0100
Subject: [PATCH 13/15] avcodec/x86/vp8dsp: Don't use saturated addition when
unnecessary
For the epel functions, there can be no overflow as long as the sum
contains only one of the two large central coefficients; for bilinear
functions, there can be no overflow whatsoever.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp8dsp.asm | 38 +++++++++++++++++++-------------------
1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 340f6cc818..22356f687b 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -450,10 +450,10 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h
pmullw m3, [mxq+48]
%endif
add srcq, srcstrideq
- paddsw m0, m1
- paddsw m2, m3
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m4
paddsw m0, m2
- paddsw m0, m4
psraw m0, 7
packuswb m0, m7
movh [dstq], m0 ; store
@@ -511,12 +511,12 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h
pmullw m5, [mxq+80]
%endif
add srcq, srcstrideq
- paddsw m1, m4
- paddsw m0, m5
- paddsw m1, m2
- paddsw m0, m3
+ paddw m1, m4
+ paddw m0, m5
+ paddw m1, m2
+ paddw m0, m3
+ paddw m1, m6
paddsw m0, m1
- paddsw m0, m6
psraw m0, 7
packuswb m0, m7
movh [dstq], m0 ; store
@@ -556,20 +556,20 @@ cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picre
mova m3, m4
pmullw m0, [myq+0]
pmullw m4, m5
- paddsw m4, m0
+ paddw m4, m0
; then calculate positive taps
mova m0, m1
pmullw m1, [myq+16]
- paddsw m4, m1
+ paddw m4, m1
mova m1, m2
pmullw m2, [myq+32]
+ paddw m4, m6
add srcq, srcstrideq
paddsw m4, m2
mova m2, m3
; round/clip/store
- paddsw m4, m6
psraw m4, 7
packuswb m4, m7
movh [dstq], m4
@@ -612,17 +612,18 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre
pmullw m5, [myq+16]
mova m6, m4
pmullw m6, [myq+64]
- paddsw m6, m5
+ paddw m6, m5
; then calculate positive taps
movh m5, [srcq+srcstrideq] ; read new row
punpcklbw m5, m7
pmullw m0, [myq+0]
- paddsw m6, m0
+ paddw m6, [pw_64]
+ paddw m6, m0
mova m0, m1
mova m1, m2
pmullw m2, [myq+32]
- paddsw m6, m2
+ paddw m6, m2
mova m2, m3
pmullw m3, [myq+48]
add srcq, srcstrideq
@@ -633,7 +634,6 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre
paddsw m6, m5
; round/clip/store
- paddsw m6, [pw_64]
psraw m6, 7
packuswb m6, m7
movh [dstq], m6
@@ -700,8 +700,8 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
pmullw m2, m4
pmullw m3, m5
lea srcq, [srcq+srcstrideq*2]
- paddsw m0, m1
- paddsw m2, m3
+ paddw m0, m1
+ paddw m2, m3
psraw m0, 2
psraw m2, 2
pavgw m0, m6
@@ -771,8 +771,8 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
pmullw m2, m4
pmullw m3, m5
lea srcq, [srcq+srcstrideq*2]
- paddsw m0, m1
- paddsw m2, m3
+ paddw m0, m1
+ paddw m2, m3
psraw m0, 2
psraw m2, 2
pavgw m0, m6
--
2.49.1
>From 25a28953d4737b9a466d24c47170d2c99f651db8 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 2 Dec 2025 19:49:17 +0100
Subject: [PATCH 14/15] avcodec/riscv/vp8dsp_rvv: Remove unused functions
Only the sixtap functions are used for size 16.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/riscv/vp8dsp_init.c | 5 -----
libavcodec/riscv/vp8dsp_rvv.S | 9 ++++++++-
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 3e35c72198..fecf6ef9b0 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -90,27 +90,22 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_rvv;
c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_rvv;
c->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_rvv;
- c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;
c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_rvv;
c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_rvv;
c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_rvv;
- c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
#if __riscv_xlen <= 64
c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
- c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
- c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
- c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
#endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 2ee7029c60..ed08f72cdc 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -537,7 +537,14 @@ func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x, zba
endfunc
.endm
-.irp len,16,8,4
+# Only the sixtap versions are used for epel16.
+epel 16 6 h
+epel 16 6 v
+#if __riscv_xlen <= 64
+epel_hv 16 6 6
+#endif
+
+.irp len,8,4
epel \len 6 h
epel \len 4 h
epel \len 6 v
--
2.49.1
>From 83fee0147bdb91683c4aaeadc883a5e5a7066dd7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 24 Nov 2025 23:13:16 +0100
Subject: [PATCH 15/15] avcodec/vp8dsp: Don't compile unused functions
The width 16 epel functions never use four taps in any direction*,
so don't build said functions. Saves 4352B of .text and 89B of
.text.unlikely here.
*: mx and my in vp8_mc_luma() are always even.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/vp8dsp.c | 11 +++++------
tests/checkasm/vp8dsp.c | 3 ++-
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c
index 5543303adb..eabe3edb27 100644
--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -558,26 +558,21 @@ put_vp8_epel ## SIZE ## _h ## HTAPS ## v ## VTAPS ## _c(uint8_t *dst, \
} \
}
-VP8_EPEL_H(16, 4)
VP8_EPEL_H(8, 4)
VP8_EPEL_H(4, 4)
VP8_EPEL_H(16, 6)
VP8_EPEL_H(8, 6)
VP8_EPEL_H(4, 6)
-VP8_EPEL_V(16, 4)
VP8_EPEL_V(8, 4)
VP8_EPEL_V(4, 4)
VP8_EPEL_V(16, 6)
VP8_EPEL_V(8, 6)
VP8_EPEL_V(4, 6)
-VP8_EPEL_HV(16, 4, 4)
VP8_EPEL_HV(8, 4, 4)
VP8_EPEL_HV(4, 4, 4)
-VP8_EPEL_HV(16, 4, 6)
VP8_EPEL_HV(8, 4, 6)
VP8_EPEL_HV(4, 4, 6)
-VP8_EPEL_HV(16, 6, 4)
VP8_EPEL_HV(8, 6, 4)
VP8_EPEL_HV(4, 6, 4)
VP8_EPEL_HV(16, 6, 6)
@@ -667,7 +662,11 @@ VP8_BILINEAR(4)
av_cold void ff_vp78dsp_init(VP8DSPContext *dsp)
{
- VP78_MC_FUNC(0, 16);
+ dsp->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_c;
+ dsp->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_c;
+ dsp->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_c;
+ dsp->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_c;
+
VP78_MC_FUNC(1, 8);
VP78_MC_FUNC(2, 4);
diff --git a/tests/checkasm/vp8dsp.c b/tests/checkasm/vp8dsp.c
index a12c295a2a..4d6704d5a9 100644
--- a/tests/checkasm/vp8dsp.c
+++ b/tests/checkasm/vp8dsp.c
@@ -510,7 +510,8 @@ static void checkasm_check_vp78dsp(VP8DSPContext *d, bool is_vp7)
void checkasm_check_vp8dsp(void)
{
- VP8DSPContext d;
+ // Needs to be zeroed because not all size 16 epel functions exist.
+ VP8DSPContext d = { 0 };
ff_vp78dsp_init(&d);
check_mc(&d);
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-12-02 18:55 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-12-02 18:54 [FFmpeg-devel] [PATCH] Avoid MMX in VP8 (PR #21081) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git