* [FFmpeg-devel] [PATCH] Port VP3 loopfilters to SSE2 (PR #20686)
@ 2025-10-10 14:02 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-10 14:02 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #20686 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20686
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20686.patch
Also make them bitexact (the current versions are not bitexact in extreme edge cases that don't happen in practice).
>From 98f43a540c2957624cca4024f4661b0a87906597 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 10 Oct 2025 14:58:58 +0200
Subject: [PATCH 1/4] tests/checkasm: Add VP3 loop filter test
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
tests/checkasm/Makefile | 1 +
tests/checkasm/checkasm.c | 3 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/vp3dsp.c | 117 ++++++++++++++++++++++++++++++++++++++
tests/fate/checkasm.mak | 1 +
5 files changed, 123 insertions(+)
create mode 100644 tests/checkasm/vp3dsp.c
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 7a9566eb8a..e47070d90f 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -22,6 +22,7 @@ AVCODECOBJS-$(CONFIG_ME_CMP) += motion.o
AVCODECOBJS-$(CONFIG_MPEGVIDEOENCDSP) += mpegvideoencdsp.o
AVCODECOBJS-$(CONFIG_QPELDSP) += qpeldsp.o
AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o
+AVCODECOBJS-$(CONFIG_VP3DSP) += vp3dsp.o
AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o
AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 83aa26624d..4469e043f5 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -251,6 +251,9 @@ static const struct {
#if CONFIG_VC1DSP
{ "vc1dsp", checkasm_check_vc1dsp },
#endif
+ #if CONFIG_VP3DSP
+ { "vp3dsp", checkasm_check_vp3dsp },
+ #endif
#if CONFIG_VP8DSP
{ "vp8dsp", checkasm_check_vp8dsp },
#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index bd7a896447..e1ccd4011b 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -152,6 +152,7 @@ void checkasm_check_vf_gblur(void);
void checkasm_check_vf_hflip(void);
void checkasm_check_vf_threshold(void);
void checkasm_check_vf_sobel(void);
+void checkasm_check_vp3dsp(void);
void checkasm_check_vp8dsp(void);
void checkasm_check_vp9dsp(void);
void checkasm_check_videodsp(void);
diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
new file mode 100644
index 0000000000..03466e7425
--- /dev/null
+++ b/tests/checkasm/vp3dsp.c
@@ -0,0 +1,117 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "checkasm.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/macros.h"
+#include "libavutil/mem_internal.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vp3dsp.h"
+
+enum {
+ MAX_STRIDE = 64,
+ MIN_STRIDE = 8,
+ /// Horizontal tests operate on 4x8 blocks
+ HORIZONTAL_BUF_SIZE = ((8 /* lines */ - 1) * MAX_STRIDE + 4 /* width */ + 7 /* misalignment */),
+ /// Vertical tests operate on 8x4 blocks
+ VERTICAL_BUF_SIZE = ((4 /* lines */ - 1) * MAX_STRIDE + 8 /* width */ + 7 /* misalignment */),
+};
+
+#define randomize_buffers(buf0, buf1, size) \
+ do { \
+ static_assert(sizeof(buf0[0]) == 1 && sizeof(buf1[0]) == 1, \
+ "Pointer arithmetic needs to be adapted"); \
+ for (size_t k = 0; k < (size & ~3); k += 4) { \
+ uint32_t r = rnd(); \
+ AV_WN32A(buf0 + k, r); \
+ AV_WN32A(buf1 + k, r); \
+ } \
+ for (size_t k = size & ~3; k < size; ++k) \
+ buf0[k] = buf1[k] = rnd(); \
+ } while (0)
+
+
+static void vp3_check_loop_filter(void)
+{
+ DECLARE_ALIGNED(8, uint8_t, hor_buf0)[HORIZONTAL_BUF_SIZE];
+ DECLARE_ALIGNED(8, uint8_t, hor_buf1)[HORIZONTAL_BUF_SIZE];
+ DECLARE_ALIGNED(8, uint8_t, ver_buf0)[VERTICAL_BUF_SIZE];
+ DECLARE_ALIGNED(8, uint8_t, ver_buf1)[VERTICAL_BUF_SIZE];
+ DECLARE_ALIGNED(8, int, bounding_values_array)[256 + 2];
+ int *const bounding_values = bounding_values_array + 127;
+ VP3DSPContext vp3dsp;
+ static const struct {
+ const char *name;
+ size_t offset;
+ int lines_above, lines_below;
+ int pixels_left, pixels_right;
+ unsigned alignment;
+ int horizontal;
+ } tests[] = {
+#define TEST(NAME) .name = #NAME, .offset = offsetof(VP3DSPContext, NAME)
+ { TEST(v_loop_filter_unaligned), 2, 1, 0, 7, 1, 0 },
+ { TEST(h_loop_filter_unaligned), 0, 7, 2, 1, 1, 1 },
+ { TEST(v_loop_filter), 2, 1, 0, 7, 8, 0 },
+ { TEST(h_loop_filter), 0, 7, 2, 1, 8, 1 },
+ };
+ declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
+
+ ff_vp3dsp_init(&vp3dsp, AV_CODEC_FLAG_BITEXACT);
+
+ int filter_limit = rnd() % 128;
+
+ ff_vp3dsp_set_bounding_values(bounding_values_array, filter_limit);
+
+ for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) {
+ void (*loop_filter)(uint8_t *, ptrdiff_t, int*) = *(void(**)(uint8_t *, ptrdiff_t, int*))((char*)&vp3dsp + tests[i].offset);
+
+ if (check_func(loop_filter, "%s", tests[i].name)) {
+ uint8_t *buf0 = tests[i].horizontal ? hor_buf0 : ver_buf0;
+ uint8_t *buf1 = tests[i].horizontal ? hor_buf1 : ver_buf1;
+ size_t bufsize = tests[i].horizontal ? HORIZONTAL_BUF_SIZE : VERTICAL_BUF_SIZE;
+ ptrdiff_t stride = (rnd() % (MAX_STRIDE / MIN_STRIDE) + 1) * MIN_STRIDE;
+ // Don't always use pointers that are aligned to 8.
+ size_t offset = FFALIGN(tests[i].pixels_left, tests[i].alignment) +
+ (rnd() % (MIN_STRIDE / tests[i].alignment)) * tests[i].alignment
+ + stride * tests[i].lines_above;
+ uint8_t *dst0 = buf0 + offset, *dst1 = buf1 + offset;
+
+ if (rnd() & 1) {
+ // Flip stride.
+ dst1 += (tests[i].lines_below - tests[i].lines_above) * stride;
+ dst0 += (tests[i].lines_below - tests[i].lines_above) * stride;
+ stride = -stride;
+ }
+
+ randomize_buffers(buf0, buf1, bufsize);
+ call_ref(dst0, stride, bounding_values);
+ call_new(dst1, stride, bounding_values);
+ if (memcmp(buf0, buf1, bufsize))
+ fail();
+ bench_new(dst0, stride, bounding_values);
+ }
+ }
+}
+
+void checkasm_check_vp3dsp(void)
+{
+ vp3_check_loop_filter();
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index 6d16a65521..ca1cd0dea3 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -74,6 +74,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \
fate-checkasm-vf_sobel \
fate-checkasm-videodsp \
fate-checkasm-vorbisdsp \
+ fate-checkasm-vp3dsp \
fate-checkasm-vp8dsp \
fate-checkasm-vp9dsp \
fate-checkasm-vvc_alf \
--
2.49.1
>From 127e781edf1d18e358409505b4402e23c2aa142a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 9 Oct 2025 14:58:05 +0200
Subject: [PATCH 2/4] avcodec/x86/vp3dsp: Port loop filters to SSE2
The old code operated on bytes and used lots of tricks
to work around their limited range; these tricks did not
always succeed, which is why the old versions were not
used when bitexact output was requested.
In contrast, the new version is much simpler: It operates
on signed 16 bit words whose range is more than sufficient.
This means that these functions don't need a check for bitexactness
(and can be used in FATE).
Old benchmarks (for this, the AV_CODEC_FLAG_BITEXACT check has been
removed from checkasm):
h_loop_filter_c: 29.8 ( 1.00x)
h_loop_filter_mmxext: 32.2 ( 0.93x)
h_loop_filter_unaligned_c: 29.9 ( 1.00x)
h_loop_filter_unaligned_mmxext: 31.4 ( 0.95x)
v_loop_filter_c: 39.3 ( 1.00x)
v_loop_filter_mmxext: 14.2 ( 2.78x)
v_loop_filter_unaligned_c: 38.9 ( 1.00x)
v_loop_filter_unaligned_mmxext: 14.3 ( 2.72x)
New benchmarks:
h_loop_filter_c: 29.2 ( 1.00x)
h_loop_filter_sse2: 28.6 ( 1.02x)
h_loop_filter_unaligned_c: 29.0 ( 1.00x)
h_loop_filter_unaligned_sse2: 26.9 ( 1.08x)
v_loop_filter_c: 38.3 ( 1.00x)
v_loop_filter_sse2: 11.0 ( 3.47x)
v_loop_filter_unaligned_c: 35.5 ( 1.00x)
v_loop_filter_unaligned_sse2: 11.2 ( 3.18x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/vp3.c | 2 +-
libavcodec/vp3dsp.c | 5 ++
libavcodec/x86/vp3dsp.asm | 158 +++++++++++++++++++----------------
libavcodec/x86/vp3dsp_init.c | 18 ++--
tests/checkasm/vp3dsp.c | 4 +-
5 files changed, 101 insertions(+), 86 deletions(-)
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 59ad41bb85..549c698b77 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -325,7 +325,7 @@ typedef struct Vp3DecodeContext {
HuffTable huffman_table[5 * 16];
uint8_t filter_limit_values[64];
- DECLARE_ALIGNED(8, int, bounding_values_array)[256 + 2];
+ DECLARE_ALIGNED(16, int, bounding_values_array)[256 + 4];
VP4Predictor * dc_pred_row; /* dc_pred_row[y_superblock_width * 4] */
} Vp3DecodeContext;
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index b4621f07e7..e993d165d9 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -494,5 +494,10 @@ void ff_vp3dsp_set_bounding_values(int * bounding_values_array, int filter_limit
}
if (value)
bounding_values[128] = value;
+#if ARCH_X86
+ bounding_values[129] = bounding_values[130] =
+ bounding_values[131] = bounding_values[132] = filter_limit * 0x00020002U;
+#else
bounding_values[129] = bounding_values[130] = filter_limit * 0x02020202U;
+#endif
}
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 005ecbc9a0..f2fc1efd32 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -33,113 +33,125 @@ vp3_idct_data: times 8 dw 64277
times 8 dw 25080
times 8 dw 12785
-pb_7: times 8 db 0x07
-pb_1F: times 8 db 0x1f
-pb_81: times 8 db 0x81
-
-cextern pb_1
-cextern pb_3
cextern pb_80
cextern pb_FE
+cextern pw_4
cextern pw_8
SECTION .text
-; this is off by one or two for some cases when filter_limit is greater than 63
-; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
-; out: p1 in mm4, p2 in mm3
+; in: p0 in m5, p1 in m4, p2 in m2, p3 in m1, all unpacked;
+; m0 must be zeroed
+; out: p1 in m4, p2 in m2
%macro VP3_LOOP_FILTER 0
- movq m7, m6
- pand m6, [pb_7] ; p0&7
- psrlw m7, 3
- pand m7, [pb_1F] ; p0>>3
- movq m3, m2 ; p2
- pxor m2, m4
- pand m2, [pb_1] ; (p2^p1)&1
- movq m5, m2
- paddb m2, m2
- paddb m2, m5 ; 3*(p2^p1)&1
- paddb m2, m6 ; extra bits lost in shifts
- pcmpeqb m0, m0
- pxor m1, m0 ; 255 - p3
- pavgb m1, m2 ; (256 - p3 + extrabits) >> 1
- pxor m0, m4 ; 255 - p1
- pavgb m0, m3 ; (256 + p2-p1) >> 1
- paddb m1, [pb_3]
- pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2
- pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3
- paddusb m7, m1 ; d+128+1
- movq m6, [pb_81]
- psubusb m6, m7
- psubusb m7, [pb_81]
+ psubw m5, m1
+ mova m3, m2
+ paddw m5, [pw_4]
+ psubw m3, m4
+ mova m1, m3
+ paddw m1, m5
+ mova m5, [r2+516] ; 2 * filter limit
+ paddw m3, m3
+ paddw m3, m1
+ psraw m3, 3
- movq m5, [r2+516] ; flim
- pminub m6, m5
- pminub m7, m5
- movq m0, m6
- movq m1, m7
- paddb m6, m6
- paddb m7, m7
- pminub m6, m5
- pminub m7, m5
- psubb m6, m0
- psubb m7, m1
- paddusb m4, m7
- psubusb m4, m6
- psubusb m3, m7
- paddusb m3, m6
+ ; We use that clamp(2*clamp(x,2f),2f)-clamp(x,2f)
+ ; (with f = filter limit and clamp(y,l) clamping y to the interval [-l,l])
+ ; gives the desired filter value
+ psubw m0, m5
+ pminsw m3, m5
+ pmaxsw m3, m0
+ mova m1, m3
+ paddw m1, m1
+ pminsw m1, m5
+ pmaxsw m1, m0
+ psubw m1, m3
+ psubw m2, m1
+ paddw m4, m1
+
+ packuswb m4, m4
+ packuswb m2, m2
%endmacro
%macro STORE_4_WORDS 1
+%if ARCH_X86_64
+ movq r2, %1
+ mov [r0 -1], r2w
+ shr r2, 16
+ mov [r0+r1 -1], r2w
+ shr r2, 16
+%else
movd r2d, %1
mov [r0 -1], r2w
psrlq %1, 32
- shr r2, 16
+ shr r2d, 16
mov [r0+r1 -1], r2w
movd r2d, %1
+%endif
mov [r0+r1*2-1], r2w
- shr r2, 16
+ shr r2d, 16
mov [r0+r3 -1], r2w
%endmacro
-INIT_MMX mmxext
-cglobal vp3_v_loop_filter, 3, 4
- mov r3, r1
+INIT_XMM sse2
+cglobal vp3_v_loop_filter, 3, 3, 6
+ movq m1, [r0+r1 ]
neg r1
- movq m6, [r0+r1*2]
- movq m4, [r0+r1 ]
movq m2, [r0 ]
- movq m1, [r0+r3 ]
+ movq m4, [r0+r1 ]
+ movq m5, [r0+r1*2]
+
+ pxor m0, m0
+ punpcklbw m1, m0
+ punpcklbw m2, m0
+ punpcklbw m4, m0
+ punpcklbw m5, m0
VP3_LOOP_FILTER
movq [r0+r1], m4
- movq [r0 ], m3
+ movq [r0 ], m2
RET
-cglobal vp3_h_loop_filter, 3, 4
+%macro TRANSPOSE4x4 1
+ movd %1, [r0 -2]
+ movd m2, [r0+r1 -2]
+ movd m3, [r0+r1*2-2]
+ movd m4, [r0+r3 -2]
+ punpcklbw %1, m2
+ punpcklbw m3, m4
+ punpcklwd %1, m3
+%endmacro
+
+INIT_XMM sse2
+cglobal vp3_h_loop_filter, 3, 4, 6
lea r3, [r1*3]
- movd m6, [r0 -2]
- movd m4, [r0+r1 -2]
- movd m2, [r0+r1*2-2]
- movd m1, [r0+r3 -2]
- lea r0, [r0+r1*4 ]
- punpcklbw m6, [r0 -2]
- punpcklbw m4, [r0+r1 -2]
- punpcklbw m2, [r0+r1*2-2]
- punpcklbw m1, [r0+r3 -2]
+ TRANSPOSE4x4 m5
+ lea r0, [r0+r1*4]
+ TRANSPOSE4x4 m0
+ mova m2, m5
+ punpckldq m5, m0
+ punpckhdq m2, m0
+ pxor m0, m0
+ mova m4, m5
+ punpcklbw m5, m0
+ punpckhbw m4, m0
+ mova m1, m2
+ punpcklbw m2, m0
+ punpckhbw m1, m0
+
+ VP3_LOOP_FILTER
+
+ punpcklbw m4, m2
+ mova m2, m4
+ punpckhqdq m2, m2
+
+ STORE_4_WORDS m2
sub r0, r3
sub r0, r1
-
- TRANSPOSE4x4B 6, 4, 2, 1, 0
- VP3_LOOP_FILTER
- SBUTTERFLY bw, 4, 3, 5
-
STORE_4_WORDS m4
- lea r0, [r0+r1*4 ]
- STORE_4_WORDS m3
RET
%macro PAVGB_NO_RND 0
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index edac1764cb..b3dcd8c86d 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -18,12 +18,12 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <stddef.h>
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/avcodec.h"
#include "libavcodec/vp3dsp.h"
void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
@@ -31,10 +31,10 @@ void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block);
-void ff_vp3_v_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
- int *bounding_values);
-void ff_vp3_h_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
- int *bounding_values);
+void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
+ int *bounding_values);
+void ff_vp3_h_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
+ int *bounding_values);
void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
const uint8_t *b, ptrdiff_t stride,
@@ -50,15 +50,13 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
-
- if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
- c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_mmxext;
- c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_mmxext;
- }
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_vp3_idct_put_sse2;
c->idct_add = ff_vp3_idct_add_sse2;
+
+ c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_sse2;
+ c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_sse2;
}
}
diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
index 03466e7425..04871bf838 100644
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -55,7 +55,7 @@ static void vp3_check_loop_filter(void)
DECLARE_ALIGNED(8, uint8_t, hor_buf1)[HORIZONTAL_BUF_SIZE];
DECLARE_ALIGNED(8, uint8_t, ver_buf0)[VERTICAL_BUF_SIZE];
DECLARE_ALIGNED(8, uint8_t, ver_buf1)[VERTICAL_BUF_SIZE];
- DECLARE_ALIGNED(8, int, bounding_values_array)[256 + 2];
+ DECLARE_ALIGNED(16, int, bounding_values_array)[256 + 4];
int *const bounding_values = bounding_values_array + 127;
VP3DSPContext vp3dsp;
static const struct {
@@ -72,7 +72,7 @@ static void vp3_check_loop_filter(void)
{ TEST(v_loop_filter), 2, 1, 0, 7, 8, 0 },
{ TEST(h_loop_filter), 0, 7, 2, 1, 8, 1 },
};
- declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
+ declare_func(void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
ff_vp3dsp_init(&vp3dsp, AV_CODEC_FLAG_BITEXACT);
--
2.49.1
>From 3479807922e22474f8cd88dbf9ef0dd0d9b248fe Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 9 Oct 2025 18:56:37 +0200
Subject: [PATCH 3/4] avcodec/vp3dsp: Remove unused flags parameter from
ff_vp3dsp_init()
No longer necessary now that the x86 loop filter functions are
bitexact.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/arm/vp3dsp_init_arm.c | 2 +-
libavcodec/mips/vp3dsp_init_mips.c | 2 +-
libavcodec/ppc/vp3dsp_altivec.c | 2 +-
libavcodec/vp3.c | 2 +-
libavcodec/vp3dsp.c | 10 +++++-----
libavcodec/vp3dsp.h | 10 +++++-----
libavcodec/vp56.c | 2 +-
libavcodec/x86/vp3dsp_init.c | 2 +-
tests/checkasm/vp3dsp.c | 3 +--
9 files changed, 17 insertions(+), 18 deletions(-)
diff --git a/libavcodec/arm/vp3dsp_init_arm.c b/libavcodec/arm/vp3dsp_init_arm.c
index 65ea53fe0f..905c3dd624 100644
--- a/libavcodec/arm/vp3dsp_init_arm.c
+++ b/libavcodec/arm/vp3dsp_init_arm.c
@@ -30,7 +30,7 @@ void ff_vp3_idct_dc_add_neon(uint8_t *dest, ptrdiff_t stride, int16_t *data);
void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
-av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags)
+av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/mips/vp3dsp_init_mips.c b/libavcodec/mips/vp3dsp_init_mips.c
index 4252ff790e..7388386d2b 100644
--- a/libavcodec/mips/vp3dsp_init_mips.c
+++ b/libavcodec/mips/vp3dsp_init_mips.c
@@ -26,7 +26,7 @@
#include "libavcodec/vp3dsp.h"
#include "vp3dsp_mips.h"
-av_cold void ff_vp3dsp_init_mips(VP3DSPContext *c, int flags)
+av_cold void ff_vp3dsp_init_mips(VP3DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c
index a9a48d145b..30551a1a20 100644
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@ -192,7 +192,7 @@ static void vp3_idct_add_altivec(uint8_t *dst, ptrdiff_t stride, int16_t block[6
#endif /* HAVE_ALTIVEC */
-av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
+av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c)
{
#if HAVE_ALTIVEC
if (!PPC_ALTIVEC(av_get_cpu_flags()))
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 549c698b77..406c4f499b 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -2385,7 +2385,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
ff_hpeldsp_init(&s->hdsp, avctx->flags | AV_CODEC_FLAG_BITEXACT);
ff_videodsp_init(&s->vdsp, 8);
- ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
+ ff_vp3dsp_init(&s->vp3dsp);
for (int i = 0; i < 64; i++) {
#define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index e993d165d9..b96b4dea68 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -445,7 +445,7 @@ static void put_no_rnd_pixels_l2(uint8_t *dst, const uint8_t *src1,
}
}
-av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
+av_cold void ff_vp3dsp_init(VP3DSPContext *c)
{
c->put_no_rnd_pixels_l2 = put_no_rnd_pixels_l2;
@@ -456,13 +456,13 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
c->h_loop_filter = c->h_loop_filter_unaligned = vp3_h_loop_filter_8_c;
#if ARCH_ARM
- ff_vp3dsp_init_arm(c, flags);
+ ff_vp3dsp_init_arm(c);
#elif ARCH_PPC
- ff_vp3dsp_init_ppc(c, flags);
+ ff_vp3dsp_init_ppc(c);
#elif ARCH_X86
- ff_vp3dsp_init_x86(c, flags);
+ ff_vp3dsp_init_x86(c);
#elif ARCH_MIPS
- ff_vp3dsp_init_mips(c, flags);
+ ff_vp3dsp_init_mips(c);
#endif
}
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 3b849ec05d..1d5dd4b738 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -53,11 +53,11 @@ void ff_vp3dsp_h_loop_filter_12(uint8_t *first_pixel, ptrdiff_t stride, int *bou
void ff_vp3dsp_idct10_put(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3dsp_idct10_add(uint8_t *dest, ptrdiff_t stride, int16_t *block);
-void ff_vp3dsp_init(VP3DSPContext *c, int flags);
-void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags);
-void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags);
-void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags);
-void ff_vp3dsp_init_mips(VP3DSPContext *c, int flags);
+void ff_vp3dsp_init(VP3DSPContext *c);
+void ff_vp3dsp_init_arm(VP3DSPContext *c);
+void ff_vp3dsp_init_ppc(VP3DSPContext *c);
+void ff_vp3dsp_init_x86(VP3DSPContext *c);
+void ff_vp3dsp_init_mips(VP3DSPContext *c);
void ff_vp3dsp_set_bounding_values(int * bound_values_array, int filter_limit);
diff --git a/libavcodec/vp56.c b/libavcodec/vp56.c
index 4f2381f64a..dc3ae70c66 100644
--- a/libavcodec/vp56.c
+++ b/libavcodec/vp56.c
@@ -828,7 +828,7 @@ av_cold int ff_vp56_init_context(AVCodecContext *avctx, VP56Context *s,
ff_h264chroma_init(&s->h264chroma, 8);
ff_hpeldsp_init(&s->hdsp, avctx->flags);
ff_videodsp_init(&s->vdsp, 8);
- ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
+ ff_vp3dsp_init(&s->vp3dsp);
for (i = 0; i < 64; i++) {
#define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))
s->idct_scantable[i] = TRANSPOSE(ff_zigzag_direct[i]);
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index b3dcd8c86d..42daf99981 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -40,7 +40,7 @@ void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
const uint8_t *b, ptrdiff_t stride,
int h);
-av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
+av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
index 04871bf838..81f096dc2a 100644
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -23,7 +23,6 @@
#include "libavutil/intreadwrite.h"
#include "libavutil/macros.h"
#include "libavutil/mem_internal.h"
-#include "libavcodec/avcodec.h"
#include "libavcodec/vp3dsp.h"
enum {
@@ -74,7 +73,7 @@ static void vp3_check_loop_filter(void)
};
declare_func(void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
- ff_vp3dsp_init(&vp3dsp, AV_CODEC_FLAG_BITEXACT);
+ ff_vp3dsp_init(&vp3dsp);
int filter_limit = rnd() % 128;
--
2.49.1
>From 2d36506fab2422df4523f0d1a6a38f3bb98732dd Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 10 Oct 2025 15:49:04 +0200
Subject: [PATCH 4/4] avcodec/vp3: Optimize alignment check away when possible
Check only on arches that need said check.
(Btw: I do not see how h_loop_filter benefits from alignment
at all and why h_loop_filter_unaligned exists.)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/vp3.c | 2 +-
libavcodec/vp3dsp.h | 4 ++++
tests/checkasm/vp3dsp.c | 4 ++--
3 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 406c4f499b..0613254c14 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -2031,7 +2031,7 @@ static int vp4_mc_loop_filter(Vp3DecodeContext *s, int plane, int motion_x, int
plane_height);
#define safe_loop_filter(name, ptr, stride, bounding_values) \
- if ((uintptr_t)(ptr) & 7) \
+ if (VP3_LOOP_FILTER_NO_UNALIGNED_SUPPORT && (uintptr_t)(ptr) & 7) \
s->vp3dsp.name##_unaligned(ptr, stride, bounding_values); \
else \
s->vp3dsp.name(ptr, stride, bounding_values);
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 1d5dd4b738..7512676379 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -22,6 +22,10 @@
#include <stddef.h>
#include <stdint.h>
+// If this is one, {v,h}_loop_filter expect src to be aligned on eight bytes;
+// otherwise they don't have any alignment requirements for src.
+#define VP3_LOOP_FILTER_NO_UNALIGNED_SUPPORT (ARCH_ARM || ARCH_MIPS)
+
typedef struct VP3DSPContext {
/**
* Copy 8xH pixels from source to destination buffer using a bilinear
diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
index 81f096dc2a..4f985ae62a 100644
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -68,8 +68,8 @@ static void vp3_check_loop_filter(void)
#define TEST(NAME) .name = #NAME, .offset = offsetof(VP3DSPContext, NAME)
{ TEST(v_loop_filter_unaligned), 2, 1, 0, 7, 1, 0 },
{ TEST(h_loop_filter_unaligned), 0, 7, 2, 1, 1, 1 },
- { TEST(v_loop_filter), 2, 1, 0, 7, 8, 0 },
- { TEST(h_loop_filter), 0, 7, 2, 1, 8, 1 },
+ { TEST(v_loop_filter), 2, 1, 0, 7, VP3_LOOP_FILTER_NO_UNALIGNED_SUPPORT ? 8 : 1, 0 },
+ { TEST(h_loop_filter), 0, 7, 2, 1, VP3_LOOP_FILTER_NO_UNALIGNED_SUPPORT ? 8 : 1, 1 },
};
declare_func(void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-10-10 14:03 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-10 14:02 [FFmpeg-devel] [PATCH] Port VP3 loopfilters to SSE2 (PR #20686) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git