[FFmpeg-devel] [PATCH] Port VP3 loopfilters to SSE2 (PR #20686)

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH] Port VP3 loopfilters to SSE2 (PR #20686)
@ 2025-10-10 14:02 mkver via ffmpeg-devel
  0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-10 14:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: mkver

PR #20686 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20686
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20686.patch

Also make them bitexact (the current versions are not bitexact in extreme edge cases that don't occur in practice).


>From 98f43a540c2957624cca4024f4661b0a87906597 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 10 Oct 2025 14:58:58 +0200
Subject: [PATCH 1/4] tests/checkasm: Add VP3 loop filter test

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 tests/checkasm/Makefile   |   1 +
 tests/checkasm/checkasm.c |   3 +
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vp3dsp.c   | 117 ++++++++++++++++++++++++++++++++++++++
 tests/fate/checkasm.mak   |   1 +
 5 files changed, 123 insertions(+)
 create mode 100644 tests/checkasm/vp3dsp.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 7a9566eb8a..e47070d90f 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -22,6 +22,7 @@ AVCODECOBJS-$(CONFIG_ME_CMP)            += motion.o
 AVCODECOBJS-$(CONFIG_MPEGVIDEOENCDSP)   += mpegvideoencdsp.o
 AVCODECOBJS-$(CONFIG_QPELDSP)           += qpeldsp.o
 AVCODECOBJS-$(CONFIG_VC1DSP)            += vc1dsp.o
+AVCODECOBJS-$(CONFIG_VP3DSP)            += vp3dsp.o
 AVCODECOBJS-$(CONFIG_VP8DSP)            += vp8dsp.o
 AVCODECOBJS-$(CONFIG_VIDEODSP)          += videodsp.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 83aa26624d..4469e043f5 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -251,6 +251,9 @@ static const struct {
     #if CONFIG_VC1DSP
         { "vc1dsp", checkasm_check_vc1dsp },
     #endif
+    #if CONFIG_VP3DSP
+        { "vp3dsp", checkasm_check_vp3dsp },
+    #endif
     #if CONFIG_VP8DSP
         { "vp8dsp", checkasm_check_vp8dsp },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index bd7a896447..e1ccd4011b 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -152,6 +152,7 @@ void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
 void checkasm_check_vf_sobel(void);
+void checkasm_check_vp3dsp(void);
 void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
new file mode 100644
index 0000000000..03466e7425
--- /dev/null
+++ b/tests/checkasm/vp3dsp.c
@@ -0,0 +1,117 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "checkasm.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/macros.h"
+#include "libavutil/mem_internal.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vp3dsp.h"
+
+enum {
+    MAX_STRIDE          = 64,
+    MIN_STRIDE          = 8,
+    /// Horizontal tests operate on 4x8 blocks
+    HORIZONTAL_BUF_SIZE = ((8 /* lines */ - 1) * MAX_STRIDE + 4 /* width */ + 7 /* misalignment */),
+    /// Vertical tests operate on 8x4 blocks
+    VERTICAL_BUF_SIZE   = ((4 /* lines */ - 1) * MAX_STRIDE + 8 /* width */ + 7 /* misalignment */),
+};
+
+#define randomize_buffers(buf0, buf1, size)                \
+    do {                                                   \
+        static_assert(sizeof(buf0[0]) == 1 && sizeof(buf1[0]) == 1, \
+                      "Pointer arithmetic needs to be adapted"); \
+        for (size_t k = 0; k < (size & ~3); k += 4) {      \
+            uint32_t r = rnd();                            \
+            AV_WN32A(buf0 + k, r);                         \
+            AV_WN32A(buf1 + k, r);                         \
+        }                                                  \
+        for (size_t k = size & ~3; k < size; ++k)          \
+            buf0[k] = buf1[k] = rnd();                     \
+    } while (0)
+
+
+static void vp3_check_loop_filter(void)
+{
+    DECLARE_ALIGNED(8, uint8_t, hor_buf0)[HORIZONTAL_BUF_SIZE];
+    DECLARE_ALIGNED(8, uint8_t, hor_buf1)[HORIZONTAL_BUF_SIZE];
+    DECLARE_ALIGNED(8, uint8_t, ver_buf0)[VERTICAL_BUF_SIZE];
+    DECLARE_ALIGNED(8, uint8_t, ver_buf1)[VERTICAL_BUF_SIZE];
+    DECLARE_ALIGNED(8, int, bounding_values_array)[256 + 2];
+    int *const bounding_values = bounding_values_array + 127;
+    VP3DSPContext vp3dsp;
+    static const struct {
+        const char *name;
+        size_t offset;
+        int lines_above, lines_below;
+        int pixels_left, pixels_right;
+        unsigned alignment;
+        int horizontal;
+    } tests[] = {
+#define TEST(NAME) .name = #NAME, .offset = offsetof(VP3DSPContext, NAME)
+        { TEST(v_loop_filter_unaligned), 2, 1, 0, 7, 1, 0 },
+        { TEST(h_loop_filter_unaligned), 0, 7, 2, 1, 1, 1 },
+        { TEST(v_loop_filter),           2, 1, 0, 7, 8, 0 },
+        { TEST(h_loop_filter),           0, 7, 2, 1, 8, 1 },
+    };
+    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
+
+    ff_vp3dsp_init(&vp3dsp, AV_CODEC_FLAG_BITEXACT);
+
+    int filter_limit = rnd() % 128;
+
+    ff_vp3dsp_set_bounding_values(bounding_values_array, filter_limit);
+
+    for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) {
+        void (*loop_filter)(uint8_t *, ptrdiff_t, int*) = *(void(**)(uint8_t *, ptrdiff_t, int*))((char*)&vp3dsp + tests[i].offset);
+
+        if (check_func(loop_filter, "%s", tests[i].name)) {
+            uint8_t  *buf0 = tests[i].horizontal ? hor_buf0 : ver_buf0;
+            uint8_t  *buf1 = tests[i].horizontal ? hor_buf1 : ver_buf1;
+            size_t bufsize = tests[i].horizontal ? HORIZONTAL_BUF_SIZE : VERTICAL_BUF_SIZE;
+            ptrdiff_t stride = (rnd() % (MAX_STRIDE / MIN_STRIDE) + 1) * MIN_STRIDE;
+            // Don't always use pointers that are aligned to 8.
+            size_t offset = FFALIGN(tests[i].pixels_left, tests[i].alignment) +
+                            (rnd() % (MIN_STRIDE / tests[i].alignment)) * tests[i].alignment
+                            + stride * tests[i].lines_above;
+            uint8_t *dst0 = buf0 + offset, *dst1 = buf1 + offset;
+
+            if (rnd() & 1) {
+                // Flip stride.
+                dst1  += (tests[i].lines_below - tests[i].lines_above) * stride;
+                dst0  += (tests[i].lines_below - tests[i].lines_above) * stride;
+                stride = -stride;
+            }
+
+            randomize_buffers(buf0, buf1, bufsize);
+            call_ref(dst0, stride, bounding_values);
+            call_new(dst1, stride, bounding_values);
+            if (memcmp(buf0, buf1, bufsize))
+                fail();
+            bench_new(dst0, stride, bounding_values);
+        }
+    }
+}
+
+void checkasm_check_vp3dsp(void)
+{
+    vp3_check_loop_filter();
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index 6d16a65521..ca1cd0dea3 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -74,6 +74,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                                 \
                 fate-checkasm-vf_sobel                                  \
                 fate-checkasm-videodsp                                  \
                 fate-checkasm-vorbisdsp                                 \
+                fate-checkasm-vp3dsp                                    \
                 fate-checkasm-vp8dsp                                    \
                 fate-checkasm-vp9dsp                                    \
                 fate-checkasm-vvc_alf                                   \
-- 
2.49.1


>From 127e781edf1d18e358409505b4402e23c2aa142a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 9 Oct 2025 14:58:05 +0200
Subject: [PATCH 2/4] avcodec/x86/vp3dsp: Port loop filters to SSE2

The old code operated on bytes and needed lots of tricks
to cope with their limited range; these tricks did not
completely succeed, which is why the old versions were not
used when bitexact output was requested.

In contrast, the new version is much simpler: it operates
on signed 16-bit words, whose range is more than sufficient.
This means that these functions don't need a check for bitexactness
(and can be used in FATE).

Old benchmarks (for this, the AV_CODEC_FLAG_BITEXACT check has been
removed from checkasm):
h_loop_filter_c:                                        29.8 ( 1.00x)
h_loop_filter_mmxext:                                   32.2 ( 0.93x)
h_loop_filter_unaligned_c:                              29.9 ( 1.00x)
h_loop_filter_unaligned_mmxext:                         31.4 ( 0.95x)
v_loop_filter_c:                                        39.3 ( 1.00x)
v_loop_filter_mmxext:                                   14.2 ( 2.78x)
v_loop_filter_unaligned_c:                              38.9 ( 1.00x)
v_loop_filter_unaligned_mmxext:                         14.3 ( 2.72x)

New benchmarks:
h_loop_filter_c:                                        29.2 ( 1.00x)
h_loop_filter_sse2:                                     28.6 ( 1.02x)
h_loop_filter_unaligned_c:                              29.0 ( 1.00x)
h_loop_filter_unaligned_sse2:                           26.9 ( 1.08x)
v_loop_filter_c:                                        38.3 ( 1.00x)
v_loop_filter_sse2:                                     11.0 ( 3.47x)
v_loop_filter_unaligned_c:                              35.5 ( 1.00x)
v_loop_filter_unaligned_sse2:                           11.2 ( 3.18x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/vp3.c             |   2 +-
 libavcodec/vp3dsp.c          |   5 ++
 libavcodec/x86/vp3dsp.asm    | 158 +++++++++++++++++++----------------
 libavcodec/x86/vp3dsp_init.c |  18 ++--
 tests/checkasm/vp3dsp.c      |   4 +-
 5 files changed, 101 insertions(+), 86 deletions(-)

diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 59ad41bb85..549c698b77 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -325,7 +325,7 @@ typedef struct Vp3DecodeContext {
     HuffTable huffman_table[5 * 16];
 
     uint8_t filter_limit_values[64];
-    DECLARE_ALIGNED(8, int, bounding_values_array)[256 + 2];
+    DECLARE_ALIGNED(16, int, bounding_values_array)[256 + 4];
 
     VP4Predictor * dc_pred_row; /* dc_pred_row[y_superblock_width * 4] */
 } Vp3DecodeContext;
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index b4621f07e7..e993d165d9 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -494,5 +494,10 @@ void ff_vp3dsp_set_bounding_values(int * bounding_values_array, int filter_limit
     }
     if (value)
         bounding_values[128] = value;
+#if ARCH_X86
+    bounding_values[129] = bounding_values[130] =
+    bounding_values[131] = bounding_values[132] = filter_limit * 0x00020002U;
+#else
     bounding_values[129] = bounding_values[130] = filter_limit * 0x02020202U;
+#endif
 }
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 005ecbc9a0..f2fc1efd32 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -33,113 +33,125 @@ vp3_idct_data: times 8 dw 64277
                times 8 dw 25080
                times 8 dw 12785
 
-pb_7:  times 8 db 0x07
-pb_1F: times 8 db 0x1f
-pb_81: times 8 db 0x81
-
-cextern pb_1
-cextern pb_3
 cextern pb_80
 cextern pb_FE
 
+cextern pw_4
 cextern pw_8
 
 SECTION .text
 
-; this is off by one or two for some cases when filter_limit is greater than 63
-; in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
-; out: p1 in mm4, p2 in mm3
+; in:  p0 in m5, p1 in m4, p2 in m2, p3 in m1, all unpacked;
+;      m0 must be zeroed
+; out: p1 in m4, p2 in m2
 %macro VP3_LOOP_FILTER 0
-    movq          m7, m6
-    pand          m6, [pb_7]    ; p0&7
-    psrlw         m7, 3
-    pand          m7, [pb_1F]   ; p0>>3
-    movq          m3, m2        ; p2
-    pxor          m2, m4
-    pand          m2, [pb_1]    ; (p2^p1)&1
-    movq          m5, m2
-    paddb         m2, m2
-    paddb         m2, m5        ; 3*(p2^p1)&1
-    paddb         m2, m6        ; extra bits lost in shifts
-    pcmpeqb       m0, m0
-    pxor          m1, m0        ; 255 - p3
-    pavgb         m1, m2        ; (256 - p3 + extrabits) >> 1
-    pxor          m0, m4        ; 255 - p1
-    pavgb         m0, m3        ; (256 + p2-p1) >> 1
-    paddb         m1, [pb_3]
-    pavgb         m1, m0        ; 128+2+(   p2-p1  - p3) >> 2
-    pavgb         m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
-    paddusb       m7, m1        ; d+128+1
-    movq          m6, [pb_81]
-    psubusb       m6, m7
-    psubusb       m7, [pb_81]
+    psubw         m5, m1
+    mova          m3, m2
+    paddw         m5, [pw_4]
+    psubw         m3, m4
+    mova          m1, m3
+    paddw         m1, m5
+    mova          m5, [r2+516]  ; 2 * filter limit
+    paddw         m3, m3
+    paddw         m3, m1
+    psraw         m3, 3
 
-    movq          m5, [r2+516]  ; flim
-    pminub        m6, m5
-    pminub        m7, m5
-    movq          m0, m6
-    movq          m1, m7
-    paddb         m6, m6
-    paddb         m7, m7
-    pminub        m6, m5
-    pminub        m7, m5
-    psubb         m6, m0
-    psubb         m7, m1
-    paddusb       m4, m7
-    psubusb       m4, m6
-    psubusb       m3, m7
-    paddusb       m3, m6
+    ; We use that clamp(2*clamp(x,2f),2f)-clamp(x,2f)
+    ; (with f = filter limit and clamp(y,l) clamping y to [-l,l])
+    ; gives the desired filter value
+    psubw         m0, m5
+    pminsw        m3, m5
+    pmaxsw        m3, m0
+    mova          m1, m3
+    paddw         m1, m1
+    pminsw        m1, m5
+    pmaxsw        m1, m0
+    psubw         m1, m3
+    psubw         m2, m1
+    paddw         m4, m1
+
+    packuswb      m4, m4
+    packuswb      m2, m2
 %endmacro
 
 %macro STORE_4_WORDS 1
+%if ARCH_X86_64
+    movq          r2, %1
+    mov  [r0     -1], r2w
+    shr           r2, 16
+    mov  [r0+r1  -1], r2w
+    shr           r2, 16
+%else
     movd         r2d, %1
     mov  [r0     -1], r2w
     psrlq         %1, 32
-    shr           r2, 16
+    shr          r2d, 16
     mov  [r0+r1  -1], r2w
     movd         r2d, %1
+%endif
     mov  [r0+r1*2-1], r2w
-    shr           r2, 16
+    shr          r2d, 16
     mov  [r0+r3  -1], r2w
 %endmacro
 
-INIT_MMX mmxext
-cglobal vp3_v_loop_filter, 3, 4
-    mov           r3, r1
+INIT_XMM sse2
+cglobal vp3_v_loop_filter, 3, 3, 6
+    movq          m1, [r0+r1  ]
     neg           r1
-    movq          m6, [r0+r1*2]
-    movq          m4, [r0+r1  ]
     movq          m2, [r0     ]
-    movq          m1, [r0+r3  ]
+    movq          m4, [r0+r1  ]
+    movq          m5, [r0+r1*2]
+
+    pxor          m0, m0
+    punpcklbw     m1, m0
+    punpcklbw     m2, m0
+    punpcklbw     m4, m0
+    punpcklbw     m5, m0
 
     VP3_LOOP_FILTER
 
     movq     [r0+r1], m4
-    movq     [r0   ], m3
+    movq     [r0   ], m2
     RET
 
-cglobal vp3_h_loop_filter, 3, 4
+%macro TRANSPOSE4x4 1
+    movd         %1, [r0     -2]
+    movd         m2, [r0+r1  -2]
+    movd         m3, [r0+r1*2-2]
+    movd         m4, [r0+r3  -2]
+    punpcklbw    %1, m2
+    punpcklbw    m3, m4
+    punpcklwd    %1, m3
+%endmacro
+
+INIT_XMM sse2
+cglobal vp3_h_loop_filter, 3, 4, 6
     lea           r3, [r1*3]
 
-    movd          m6, [r0     -2]
-    movd          m4, [r0+r1  -2]
-    movd          m2, [r0+r1*2-2]
-    movd          m1, [r0+r3  -2]
-    lea           r0, [r0+r1*4  ]
-    punpcklbw     m6, [r0     -2]
-    punpcklbw     m4, [r0+r1  -2]
-    punpcklbw     m2, [r0+r1*2-2]
-    punpcklbw     m1, [r0+r3  -2]
+    TRANSPOSE4x4  m5
+    lea           r0, [r0+r1*4]
+    TRANSPOSE4x4  m0
+    mova          m2, m5
+    punpckldq     m5, m0
+    punpckhdq     m2, m0
+    pxor          m0, m0
+    mova          m4, m5
+    punpcklbw     m5, m0
+    punpckhbw     m4, m0
+    mova          m1, m2
+    punpcklbw     m2, m0
+    punpckhbw     m1, m0
+
+    VP3_LOOP_FILTER
+
+    punpcklbw     m4, m2
+    mova          m2, m4
+    punpckhqdq    m2, m2
+
+    STORE_4_WORDS m2
     sub           r0, r3
     sub           r0, r1
-
-    TRANSPOSE4x4B  6, 4, 2, 1, 0
-    VP3_LOOP_FILTER
-    SBUTTERFLY    bw, 4, 3, 5
-
     STORE_4_WORDS m4
-    lea           r0, [r0+r1*4  ]
-    STORE_4_WORDS m3
     RET
 
 %macro PAVGB_NO_RND 0
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index edac1764cb..b3dcd8c86d 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -18,12 +18,12 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <stddef.h>
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/avcodec.h"
 #include "libavcodec/vp3dsp.h"
 
 void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
@@ -31,10 +31,10 @@ void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 
 void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 
-void ff_vp3_v_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
-                                 int *bounding_values);
-void ff_vp3_h_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
-                                 int *bounding_values);
+void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
+                               int *bounding_values);
+void ff_vp3_h_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
+                               int *bounding_values);
 
 void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
                                      const uint8_t *b, ptrdiff_t stride,
@@ -50,15 +50,13 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
-
-        if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
-            c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_mmxext;
-            c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_mmxext;
-        }
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->idct_put  = ff_vp3_idct_put_sse2;
         c->idct_add  = ff_vp3_idct_add_sse2;
+
+        c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_sse2;
+        c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_sse2;
     }
 }
diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
index 03466e7425..04871bf838 100644
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -55,7 +55,7 @@ static void vp3_check_loop_filter(void)
     DECLARE_ALIGNED(8, uint8_t, hor_buf1)[HORIZONTAL_BUF_SIZE];
     DECLARE_ALIGNED(8, uint8_t, ver_buf0)[VERTICAL_BUF_SIZE];
     DECLARE_ALIGNED(8, uint8_t, ver_buf1)[VERTICAL_BUF_SIZE];
-    DECLARE_ALIGNED(8, int, bounding_values_array)[256 + 2];
+    DECLARE_ALIGNED(16, int, bounding_values_array)[256 + 4];
     int *const bounding_values = bounding_values_array + 127;
     VP3DSPContext vp3dsp;
     static const struct {
@@ -72,7 +72,7 @@ static void vp3_check_loop_filter(void)
         { TEST(v_loop_filter),           2, 1, 0, 7, 8, 0 },
         { TEST(h_loop_filter),           0, 7, 2, 1, 8, 1 },
     };
-    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
+    declare_func(void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
 
     ff_vp3dsp_init(&vp3dsp, AV_CODEC_FLAG_BITEXACT);
 
-- 
2.49.1


>From 3479807922e22474f8cd88dbf9ef0dd0d9b248fe Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 9 Oct 2025 18:56:37 +0200
Subject: [PATCH 3/4] avcodec/vp3dsp: Remove unused flags parameter from
 ff_vp3dsp_init()

No longer necessary now that the x86 loop filter functions are
bitexact.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/arm/vp3dsp_init_arm.c   |  2 +-
 libavcodec/mips/vp3dsp_init_mips.c |  2 +-
 libavcodec/ppc/vp3dsp_altivec.c    |  2 +-
 libavcodec/vp3.c                   |  2 +-
 libavcodec/vp3dsp.c                | 10 +++++-----
 libavcodec/vp3dsp.h                | 10 +++++-----
 libavcodec/vp56.c                  |  2 +-
 libavcodec/x86/vp3dsp_init.c       |  2 +-
 tests/checkasm/vp3dsp.c            |  3 +--
 9 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/libavcodec/arm/vp3dsp_init_arm.c b/libavcodec/arm/vp3dsp_init_arm.c
index 65ea53fe0f..905c3dd624 100644
--- a/libavcodec/arm/vp3dsp_init_arm.c
+++ b/libavcodec/arm/vp3dsp_init_arm.c
@@ -30,7 +30,7 @@ void ff_vp3_idct_dc_add_neon(uint8_t *dest, ptrdiff_t stride, int16_t *data);
 void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
 void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
 
-av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags)
+av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
diff --git a/libavcodec/mips/vp3dsp_init_mips.c b/libavcodec/mips/vp3dsp_init_mips.c
index 4252ff790e..7388386d2b 100644
--- a/libavcodec/mips/vp3dsp_init_mips.c
+++ b/libavcodec/mips/vp3dsp_init_mips.c
@@ -26,7 +26,7 @@
 #include "libavcodec/vp3dsp.h"
 #include "vp3dsp_mips.h"
 
-av_cold void ff_vp3dsp_init_mips(VP3DSPContext *c, int flags)
+av_cold void ff_vp3dsp_init_mips(VP3DSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c
index a9a48d145b..30551a1a20 100644
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@ -192,7 +192,7 @@ static void vp3_idct_add_altivec(uint8_t *dst, ptrdiff_t stride, int16_t block[6
 
 #endif /* HAVE_ALTIVEC */
 
-av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
+av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c)
 {
 #if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 549c698b77..406c4f499b 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -2385,7 +2385,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
     ff_hpeldsp_init(&s->hdsp, avctx->flags | AV_CODEC_FLAG_BITEXACT);
     ff_videodsp_init(&s->vdsp, 8);
-    ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
+    ff_vp3dsp_init(&s->vp3dsp);
 
     for (int i = 0; i < 64; i++) {
 #define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index e993d165d9..b96b4dea68 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -445,7 +445,7 @@ static void put_no_rnd_pixels_l2(uint8_t *dst, const uint8_t *src1,
     }
 }
 
-av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
+av_cold void ff_vp3dsp_init(VP3DSPContext *c)
 {
     c->put_no_rnd_pixels_l2 = put_no_rnd_pixels_l2;
 
@@ -456,13 +456,13 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
     c->h_loop_filter = c->h_loop_filter_unaligned = vp3_h_loop_filter_8_c;
 
 #if ARCH_ARM
-    ff_vp3dsp_init_arm(c, flags);
+    ff_vp3dsp_init_arm(c);
 #elif ARCH_PPC
-    ff_vp3dsp_init_ppc(c, flags);
+    ff_vp3dsp_init_ppc(c);
 #elif ARCH_X86
-    ff_vp3dsp_init_x86(c, flags);
+    ff_vp3dsp_init_x86(c);
 #elif ARCH_MIPS
-    ff_vp3dsp_init_mips(c, flags);
+    ff_vp3dsp_init_mips(c);
 #endif
 }
 
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 3b849ec05d..1d5dd4b738 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -53,11 +53,11 @@ void ff_vp3dsp_h_loop_filter_12(uint8_t *first_pixel, ptrdiff_t stride, int *bou
 void ff_vp3dsp_idct10_put(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vp3dsp_idct10_add(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 
-void ff_vp3dsp_init(VP3DSPContext *c, int flags);
-void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags);
-void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags);
-void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags);
-void ff_vp3dsp_init_mips(VP3DSPContext *c, int flags);
+void ff_vp3dsp_init(VP3DSPContext *c);
+void ff_vp3dsp_init_arm(VP3DSPContext *c);
+void ff_vp3dsp_init_ppc(VP3DSPContext *c);
+void ff_vp3dsp_init_x86(VP3DSPContext *c);
+void ff_vp3dsp_init_mips(VP3DSPContext *c);
 
 void ff_vp3dsp_set_bounding_values(int * bound_values_array, int filter_limit);
 
diff --git a/libavcodec/vp56.c b/libavcodec/vp56.c
index 4f2381f64a..dc3ae70c66 100644
--- a/libavcodec/vp56.c
+++ b/libavcodec/vp56.c
@@ -828,7 +828,7 @@ av_cold int ff_vp56_init_context(AVCodecContext *avctx, VP56Context *s,
     ff_h264chroma_init(&s->h264chroma, 8);
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
     ff_videodsp_init(&s->vdsp, 8);
-    ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
+    ff_vp3dsp_init(&s->vp3dsp);
     for (i = 0; i < 64; i++) {
 #define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))
         s->idct_scantable[i] = TRANSPOSE(ff_zigzag_direct[i]);
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index b3dcd8c86d..42daf99981 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -40,7 +40,7 @@ void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
                                      const uint8_t *b, ptrdiff_t stride,
                                      int h);
 
-av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
+av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
index 04871bf838..81f096dc2a 100644
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -23,7 +23,6 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/macros.h"
 #include "libavutil/mem_internal.h"
-#include "libavcodec/avcodec.h"
 #include "libavcodec/vp3dsp.h"
 
 enum {
@@ -74,7 +73,7 @@ static void vp3_check_loop_filter(void)
     };
     declare_func(void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
 
-    ff_vp3dsp_init(&vp3dsp, AV_CODEC_FLAG_BITEXACT);
+    ff_vp3dsp_init(&vp3dsp);
 
     int filter_limit = rnd() % 128;
 
-- 
2.49.1


>From 2d36506fab2422df4523f0d1a6a38f3bb98732dd Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 10 Oct 2025 15:49:04 +0200
Subject: [PATCH 4/4] avcodec/vp3: Optimize alignment check away when possible

Check only on arches that need said check.

(Btw: I do not see how h_loop_filter benefits from alignment
at all, or why h_loop_filter_unaligned exists.)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/vp3.c        | 2 +-
 libavcodec/vp3dsp.h     | 4 ++++
 tests/checkasm/vp3dsp.c | 4 ++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 406c4f499b..0613254c14 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -2031,7 +2031,7 @@ static int vp4_mc_loop_filter(Vp3DecodeContext *s, int plane, int motion_x, int
              plane_height);
 
 #define safe_loop_filter(name, ptr, stride, bounding_values) \
-    if ((uintptr_t)(ptr) & 7) \
+    if (VP3_LOOP_FILTER_NO_UNALIGNED_SUPPORT && (uintptr_t)(ptr) & 7) \
         s->vp3dsp.name##_unaligned(ptr, stride, bounding_values); \
     else \
         s->vp3dsp.name(ptr, stride, bounding_values);
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 1d5dd4b738..7512676379 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -22,6 +22,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
+// If this is one, {v,h}_loop_filter expect src to be aligned on eight bytes;
+// otherwise they don't have any alignment requirements for src.
+#define VP3_LOOP_FILTER_NO_UNALIGNED_SUPPORT (ARCH_ARM || ARCH_MIPS)
+
 typedef struct VP3DSPContext {
     /**
      * Copy 8xH pixels from source to destination buffer using a bilinear
diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
index 81f096dc2a..4f985ae62a 100644
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -68,8 +68,8 @@ static void vp3_check_loop_filter(void)
 #define TEST(NAME) .name = #NAME, .offset = offsetof(VP3DSPContext, NAME)
         { TEST(v_loop_filter_unaligned), 2, 1, 0, 7, 1, 0 },
         { TEST(h_loop_filter_unaligned), 0, 7, 2, 1, 1, 1 },
-        { TEST(v_loop_filter),           2, 1, 0, 7, 8, 0 },
-        { TEST(h_loop_filter),           0, 7, 2, 1, 8, 1 },
+        { TEST(v_loop_filter),           2, 1, 0, 7, VP3_LOOP_FILTER_NO_UNALIGNED_SUPPORT ? 8 : 1, 0 },
+        { TEST(h_loop_filter),           0, 7, 2, 1, VP3_LOOP_FILTER_NO_UNALIGNED_SUPPORT ? 8 : 1, 1 },
     };
     declare_func(void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
 
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2025-10-10 14:03 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-10 14:02 [FFmpeg-devel] [PATCH] Port VP3 loopfilters to SSE2 (PR #20686) mkver via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git