Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_deblock: add luma and chroma full
@ 2024-02-21 11:10 J. Dekker
  2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock J. Dekker
  2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON J. Dekker
  0 siblings, 2 replies; 10+ messages in thread
From: J. Dekker @ 2024-02-21 11:10 UTC (permalink / raw)
  To: ffmpeg-devel

Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 tests/checkasm/hevc_deblock.c | 246 +++++++++++++++++++++++++++++-----
 1 file changed, 215 insertions(+), 31 deletions(-)

diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c
index 66fc8d5646..91e57f5cf5 100644
--- a/tests/checkasm/hevc_deblock.c
+++ b/tests/checkasm/hevc_deblock.c
@@ -19,9 +19,9 @@
 #include <string.h>
 
 #include "libavutil/intreadwrite.h"
+#include "libavutil/macros.h"
 #include "libavutil/mem_internal.h"
 
-#include "libavcodec/avcodec.h"
 #include "libavcodec/hevcdsp.h"
 
 #include "checkasm.h"
@@ -29,10 +29,11 @@
 static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
 
 #define SIZEOF_PIXEL ((bit_depth + 7) / 8)
-#define BUF_STRIDE (8 * 2)
-#define BUF_LINES (8)
-#define BUF_OFFSET (BUF_STRIDE * BUF_LINES)
-#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
+#define BUF_STRIDE (16 * 2)
+#define BUF_LINES (16)
+// large buffer sizes based on high bit depth
+#define BUF_OFFSET (2 * BUF_STRIDE * BUF_LINES)
+#define BUF_SIZE (2 * BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
 
 #define randomize_buffers(buf0, buf1, size)                 \
     do {                                                    \
@@ -45,57 +46,240 @@ static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
         }                                                   \
     } while (0)
 
-static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth)
+static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth, int c)
 {
-    int32_t tc[2] = { 0, 0 };
+    // see tctable[] in hevc_filter.c, we check full range
+    int32_t tc[2] = { rnd() % 25, rnd() % 25 };
     // no_p, no_q can only be { 0,0 } for the simpler assembly (non *_c
     // variant) functions, see deblocking_filter_CTB() in hevc_filter.c
-    uint8_t no_p[2] = { 0, 0 };
-    uint8_t no_q[2] = { 0, 0 };
+    uint8_t no_p[2] = { rnd() & c, rnd() & c };
+    uint8_t no_q[2] = { rnd() & c, rnd() & c };
     LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
     LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
 
     declare_func(void, uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q);
 
-    if (check_func(h->hevc_h_loop_filter_chroma, "hevc_h_loop_filter_chroma%d", bit_depth)) {
-        for (int i = 0; i < 4; i++) {
-            randomize_buffers(buf0, buf1, BUF_SIZE);
-            // see betatable[] in hevc_filter.c
-            tc[0] = (rnd() & 63) + (rnd() & 1);
-            tc[1] = (rnd() & 63) + (rnd() & 1);
+    if (check_func(c ? h->hevc_h_loop_filter_chroma_c : h->hevc_h_loop_filter_chroma,
+                         "hevc_h_loop_filter_chroma%d%s", bit_depth, c ? "_full" : ""))
+    {
+        randomize_buffers(buf0, buf1, BUF_SIZE);
 
-            call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
-            call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+        call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+        call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+        if (memcmp(buf0, buf1, BUF_SIZE))
+            fail();
+        bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+    }
+
+    if (check_func(c ? h->hevc_v_loop_filter_chroma_c : h->hevc_v_loop_filter_chroma,
+                         "hevc_v_loop_filter_chroma%d%s", bit_depth, c ? "_full" : ""))
+    {
+        randomize_buffers(buf0, buf1, BUF_SIZE);
+
+        call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+        call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+        if (memcmp(buf0, buf1, BUF_SIZE))
+            fail();
+        bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+    }
+}
+
+#define P3 buf[-4 * xstride]
+#define P2 buf[-3 * xstride]
+#define P1 buf[-2 * xstride]
+#define P0 buf[-1 * xstride]
+#define Q0 buf[0 * xstride]
+#define Q1 buf[1 * xstride]
+#define Q2 buf[2 * xstride]
+#define Q3 buf[3 * xstride]
+
+#define TC25(x) ((tc[x] * 5 + 1) >> 1)
+#define MASK(x) (uint16_t)(x & ((1 << (bit_depth)) - 1))
+#define GET(x) ((SIZEOF_PIXEL == 1) ? *(uint8_t*)(&x) : *(uint16_t*)(&x))
+#define SET(x, y) do { \
+    uint16_t z = MASK(y); \
+    if (SIZEOF_PIXEL == 1) \
+        *(uint8_t*)(&x) = z; \
+    else \
+        *(uint16_t*)(&x) = z; \
+} while (0)
+#define RANDCLIP(x, diff) av_clip(GET(x) - (diff), 0, \
+    (1 << (bit_depth)) - 1) + rnd() % FFMAX(2 * (diff), 1)
+
+// NOTE: this function doesn't work 'correctly' in that it won't always choose
+// strong/strong or weak/weak, in most cases it tends to but will sometimes mix
+// weak/strong or even skip sometimes. This is more useful to test correctness
+// for these functions, though it does make benching them difficult. The easiest
+// way to bench these functions is to check an overall decode since there are too
+// many paths and ways to trigger the deblock: we would have to bench all
+// permutations of weak/strong/skip/nd_q/nd_p/no_q/no_p and it quickly becomes
+// too much.
+static void randomize_luma_buffers(int type, int *beta, int32_t tc[2],
+   uint8_t *buf, ptrdiff_t xstride, ptrdiff_t ystride, int bit_depth)
+{
+    int i, j, b3, tc25, tc25diff, b3diff;
+    // both tc & beta are unscaled inputs
+    // minimum useful value is 1, full range 0-24
+    tc[0] = (rnd() % 25) + 1;
+    tc[1] = (rnd() % 25) + 1;
+    // minimum useful value for 8bit is 8
+    *beta = (rnd() % 57) + 8;
+
+    switch (type) {
+    case 0: // strong
+        for (j = 0; j < 2; j++) {
+            tc25 = TC25(j) << (bit_depth - 8);
+            tc25diff = FFMAX(tc25 - 1, 0);
+            // 4 lines per tc
+            for (i = 0; i < 4; i++) {
+                b3 = (*beta << (bit_depth - 8)) >> 3;
+
+                SET(P0, rnd() % (1 << bit_depth));
+                SET(Q0, RANDCLIP(P0, tc25diff));
+
+                // p3 - p0 up to beta3 budget
+                b3diff = rnd() % b3;
+                SET(P3, RANDCLIP(P0, b3diff));
+                // q3 - q0, reduced budget
+                b3diff = rnd() % FFMAX(b3 - b3diff, 1);
+                SET(Q3, RANDCLIP(Q0, b3diff));
+
+                // same concept, budget across 4 pixels
+                b3 -= b3diff = rnd() % FFMAX(b3, 1);
+                SET(P2, RANDCLIP(P0, b3diff));
+                b3 -= b3diff = rnd() % FFMAX(b3, 1);
+                SET(Q2, RANDCLIP(Q0, b3diff));
+
+                // extra reduced budget for weighted pixels
+                b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
+                SET(P1, RANDCLIP(P0, b3diff));
+                b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
+                SET(Q1, RANDCLIP(Q0, b3diff));
+
+                buf += ystride;
+            }
+        }
+        break;
+    case 1: // weak
+        for (j = 0; j < 2; j++) {
+            tc25 = TC25(j) << (bit_depth - 8);
+            tc25diff = FFMAX(tc25 - 1, 0);
+            // 4 lines per tc
+            for (i = 0; i < 4; i++) {
+                // Weak filtering is signficantly simpler to activate as
+                // we only need to satisfy d0 + d3 < beta, which
+                // can be simplified to d0 + d0 < beta. Using the above
+                // derivations but substiuting b3 for b1 and ensuring
+                // that P0/Q0 are at least 1/2 tc25diff apart (tending
+                // towards 1/2 range).
+                b3 = (*beta << (bit_depth - 8)) >> 1;
+
+                SET(P0, rnd() % (1 << bit_depth));
+                SET(Q0, RANDCLIP(P0, tc25diff >> 1) +
+                    (tc25diff >> 1) * (P0 < (1 << (bit_depth - 1))) ? 1 : -1);
+
+                // p3 - p0 up to beta3 budget
+                b3diff = rnd() % b3;
+                SET(P3, RANDCLIP(P0, b3diff));
+                // q3 - q0, reduced budget
+                b3diff = rnd() % FFMAX(b3 - b3diff, 1);
+                SET(Q3, RANDCLIP(Q0, b3diff));
+
+                // same concept, budget across 4 pixels
+                b3 -= b3diff = rnd() % FFMAX(b3, 1);
+                SET(P2, RANDCLIP(P0, b3diff));
+                b3 -= b3diff = rnd() % FFMAX(b3, 1);
+                SET(Q2, RANDCLIP(Q0, b3diff));
+
+                // extra reduced budget for weighted pixels
+                b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
+                SET(P1, RANDCLIP(P0, b3diff));
+                b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
+                SET(Q1, RANDCLIP(Q0, b3diff));
+
+                buf += ystride;
+            }
+        }
+        break;
+    case 2: // none
+        *beta = 0; // ensure skip
+        for (i = 0; i < 8; i++) {
+            // we can just fill with completely random data, nothing should be touched.
+            SET(P3, rnd()); SET(P2, rnd()); SET(P1, rnd()); SET(P0, rnd());
+            SET(Q0, rnd()); SET(Q1, rnd()); SET(Q2, rnd()); SET(Q3, rnd());
+            buf += ystride;
+        }
+        break;
+    }
+}
+
+static void check_deblock_luma(HEVCDSPContext *h, int bit_depth, int c)
+{
+    const char *type;
+    const char *types[3] = { "strong", "weak", "skip" };
+    int beta;
+    int32_t tc[2] = {0};
+    uint8_t no_p[2] = { rnd() & c, rnd() & c };
+    uint8_t no_q[2] = { rnd() & c, rnd() & c };
+    LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
+    uint8_t *ptr0 = buf0 + BUF_OFFSET,
+            *ptr1 = buf1 + BUF_OFFSET;
+
+    declare_func(void, uint8_t *pix, ptrdiff_t stride, int beta, int32_t *tc, uint8_t *no_p, uint8_t *no_q);
+
+    for (int j = 0; j < 3; j++) {
+        type = types[j];
+        if (check_func(c ? h->hevc_h_loop_filter_luma_c : h->hevc_h_loop_filter_luma,
+                             "hevc_h_loop_filter_luma%d_%s%s", bit_depth, type, c ? "_full" : ""))
+        {
+            randomize_luma_buffers(j, &beta, tc, buf0 + BUF_OFFSET, 16 * SIZEOF_PIXEL, SIZEOF_PIXEL, bit_depth);
+            memcpy(buf1, buf0, BUF_SIZE);
+
+            call_ref(ptr0, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
+            call_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
             if (memcmp(buf0, buf1, BUF_SIZE))
                 fail();
+            bench_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
         }
-        bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
-    }
 
-    if (check_func(h->hevc_v_loop_filter_chroma, "hevc_v_loop_filter_chroma%d", bit_depth)) {
-        for (int i = 0; i < 4; i++) {
-            randomize_buffers(buf0, buf1, BUF_SIZE);
-            // see betatable[] in hevc_filter.c
-            tc[0] = (rnd() & 63) + (rnd() & 1);
-            tc[1] = (rnd() & 63) + (rnd() & 1);
+        if (check_func(c ? h->hevc_v_loop_filter_luma_c : h->hevc_v_loop_filter_luma,
+                             "hevc_v_loop_filter_luma%d_%s%s", bit_depth, type, c ? "_full" : ""))
+        {
+            randomize_luma_buffers(j, &beta, tc, buf0 + BUF_OFFSET, SIZEOF_PIXEL, 16 * SIZEOF_PIXEL, bit_depth);
+            memcpy(buf1, buf0, BUF_SIZE);
 
-            call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
-            call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+            call_ref(ptr0, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
+            call_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
             if (memcmp(buf0, buf1, BUF_SIZE))
                 fail();
+            bench_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
         }
-        bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
     }
 }
 
 void checkasm_check_hevc_deblock(void)
 {
+    HEVCDSPContext h;
     int bit_depth;
-
     for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
-        HEVCDSPContext h;
         ff_hevc_dsp_init(&h, bit_depth);
-        check_deblock_chroma(&h, bit_depth);
+        check_deblock_chroma(&h, bit_depth, 0);
     }
     report("chroma");
+    for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+        ff_hevc_dsp_init(&h, bit_depth);
+        check_deblock_chroma(&h, bit_depth, 1);
+    }
+    report("chroma_full");
+    for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+        ff_hevc_dsp_init(&h, bit_depth);
+        check_deblock_luma(&h, bit_depth, 0);
+    }
+    report("luma");
+    for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+        ff_hevc_dsp_init(&h, bit_depth);
+        check_deblock_luma(&h, bit_depth, 1);
+    }
+    report("luma_full");
 }
-- 
2.43.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock
  2024-02-21 11:10 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_deblock: add luma and chroma full J. Dekker
@ 2024-02-21 11:10 ` J. Dekker
  2024-02-23  3:14   ` Andreas Rheinhardt
  2024-02-23 12:45   ` Nuo Mi
  2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON J. Dekker
  1 sibling, 2 replies; 10+ messages in thread
From: J. Dekker @ 2024-02-21 11:10 UTC (permalink / raw)
  To: ffmpeg-devel

Over/underflow in some cases.

Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 libavcodec/x86/hevcdsp_init.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 31e81eb11f..11cb1b3bfd 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -1205,10 +1205,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
-            if (ARCH_X86_64) {
-                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
-                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
-            }
+            // FIXME: 12-bit luma deblock over/underflows in some cases
+            // if (ARCH_X86_64) {
+            //     c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
+            //     c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
+            // }
             SAO_BAND_INIT(12, sse2);
             SAO_EDGE_INIT(12, sse2);
 
-- 
2.43.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON
  2024-02-21 11:10 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_deblock: add luma and chroma full J. Dekker
  2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock J. Dekker
@ 2024-02-21 11:10 ` J. Dekker
  2024-02-21 12:08   ` Martin Storsjö
  1 sibling, 1 reply; 10+ messages in thread
From: J. Dekker @ 2024-02-21 11:10 UTC (permalink / raw)
  To: ffmpeg-devel

Benched using single-threaded full decode on an Ampere Altra.

Bpp Before  After  Speedup
8   73,3s   65,2s  1.124x
10  114,2s  104,0s 1.098x
12  125,8s  115,7s 1.087x

Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_deblock_neon.S | 421 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
 2 files changed, 439 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S
index 8227f65649..1995f21807 100644
--- a/libavcodec/aarch64/hevcdsp_deblock_neon.S
+++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
@@ -181,3 +181,424 @@ hevc_h_loop_filter_chroma 12
 hevc_v_loop_filter_chroma 8
 hevc_v_loop_filter_chroma 10
 hevc_v_loop_filter_chroma 12
+
+.macro hevc_loop_filter_luma_body bitdepth
+function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
+.if \bitdepth > 8
+        lsl             w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8
+.else
+        uxtl            v0.8h, v0.8b
+        uxtl            v1.8h, v1.8b
+        uxtl            v2.8h, v2.8b
+        uxtl            v3.8h, v3.8b
+        uxtl            v4.8h, v4.8b
+        uxtl            v5.8h, v5.8b
+        uxtl            v6.8h, v6.8b
+        uxtl            v7.8h, v7.8b
+.endif
+        ldr             w7, [x3] // tc[0]
+        ldr             w8, [x3, #4] // tc[1]
+        dup             v18.4h, w7
+        dup             v19.4h, w8
+        trn1            v18.2d, v18.2d, v19.2d
+.if \bitdepth > 8
+        shl             v18.8h, v18.8h, #(\bitdepth - 8)
+.endif
+        dup             v27.8h, w2 // beta
+        // tc25
+        shl             v19.8h, v18.8h, #2 // * 4
+        add             v19.8h, v19.8h, v18.8h // (tc * 5)
+        srshr           v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1
+        sshr            v17.8h, v27.8h, #2 // beta2
+
+        ////// beta_2 check
+        // dp0  = abs(P2  - 2 * P1  + P0)
+        add             v22.8h, v3.8h, v1.8h
+        shl             v23.8h, v2.8h, #1
+        sabd            v30.8h, v22.8h, v23.8h
+        // dq0  = abs(Q2  - 2 * Q1  + Q0)
+        add             v21.8h, v6.8h, v4.8h
+        shl             v26.8h, v5.8h, #1
+        sabd            v31.8h, v21.8h, v26.8h
+        // d0   = dp0 + dq0
+        add             v20.8h, v30.8h, v31.8h
+        shl             v25.8h, v20.8h, #1
+        // (d0 << 1) < beta_2
+        cmgt            v23.8h, v17.8h, v25.8h
+
+        ////// beta check
+        // d0 + d3 < beta
+        mov             x9, #0xFFFF00000000FFFF
+        dup             v24.2d, x9
+        and             v25.16b, v24.16b, v20.16b
+        addp            v25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
+        addp            v25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
+        cmgt            v25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
+        mov             w9, v25.s[0]
+        cmp             w9, #0
+        sxtl            v26.4s, v25.4h
+        sxtl            v16.2d, v26.2s // full skip mask
+        b.eq            3f // skip both blocks
+
+        // TODO: we can check the full skip mask with the weak/strong mask to
+        // potentially skip weak or strong calculation entirely if we only have one
+
+        ////// beta_3 check
+        // abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3
+        sshr            v17.8h, v17.8h, #1 // beta_3
+        sabd            v20.8h, v0.8h, v3.8h
+        saba            v20.8h, v7.8h, v4.8h
+        cmgt            v21.8h, v17.8h, v20.8h
+
+        and             v23.16b, v23.16b, v21.16b
+
+        ////// tc25 check
+        // abs(P0  -  Q0) < tc25
+        sabd            v20.8h, v3.8h, v4.8h
+        cmgt            v21.8h, v19.8h, v20.8h
+
+        and             v23.16b, v23.16b, v21.16b
+
+        ////// Generate low/high line max from lines 0/3/4/7
+        // mask out lines 2/3/5/6
+        not             v20.16b, v24.16b // 0x0000FFFFFFFF0000
+        orr             v23.16b, v23.16b, v20.16b
+
+        // generate weak/strong mask
+        uminp           v23.8h, v23.8h, v23.8h // extend to singles
+        sxtl            v23.4s, v23.4h
+        uminp           v26.4s, v23.4s, v23.4s // check lines
+        // extract to gpr
+        ext             v25.16b, v26.16b, v26.16b, #2
+        zip1            v17.4s, v26.4s, v26.4s
+        mov             w12, v25.s[0]
+        mov             w11, #0x0000FFFF
+        mov             w13, #0xFFFF0000
+        // FFFF FFFF -> strong strong
+        // FFFF 0000 -> strong weak
+        // 0000 FFFF -> weak   strong
+        // 0000 0000 -> weak   weak
+        cmp             w12, w13
+        b.hi            0f // only strong/strong, skip weak nd_p/nd_q calc
+
+        ////// weak nd_p/nd_q
+        // d0+d3
+        and             v30.16b, v30.16b, v24.16b // d0 __ __ d3 d4 __ __ d7
+        and             v31.16b, v31.16b, v24.16b
+        addp            v30.8h, v30.8h, v30.8h // [d0+__ __+d3 d4+__ __+d7] [ ... ]
+        addp            v31.8h, v31.8h, v31.8h // [d0+d3 d4+d7]
+        addp            v30.4h, v30.4h, v30.4h
+        addp            v31.4h, v31.4h, v31.4h
+
+        // ((beta + (beta >> 1)) >> 3)
+        sshr            v21.8h, v27.8h, #1
+        add             v21.8h, v21.8h, v27.8h
+        sshr            v21.8h, v21.8h, #3
+
+        // nd_p = dp0 + dp3 < ((beta + (beta >> 1)) >> 3)
+        cmgt            v30.8h, v21.8h, v30.8h
+        // nd_q = dq0 + dq3 < ((beta + (beta >> 1)) >> 3)
+        cmgt            v31.8h, v21.8h, v31.8h
+
+        sxtl            v30.4s, v30.4h
+        sxtl            v31.4s, v31.4h
+        sxtl            v28.2d, v30.2s
+        sxtl            v29.2d, v31.2s
+
+        cmp             w12, w11
+        b.lo            1f // can only be weak weak, skip strong
+
+0:      // STRONG FILTER
+
+        // P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc3, tc3);
+        add             v21.8h, v2.8h, v3.8h   // (p1 + p0
+        add             v21.8h, v4.8h, v21.8h  //     + q0)
+        shl             v21.8h, v21.8h, #1     //           * 2
+        add             v22.8h, v1.8h, v5.8h   //   (p2 + q1)
+        add             v21.8h, v22.8h, v21.8h // +
+        srshr            v21.8h, v21.8h, #3     //               >> 3
+        sub             v21.8h, v21.8h, v3.8h  //                    - p0
+
+        // P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
+
+        add             v22.8h, v1.8h, v2.8h
+        add             v23.8h, v3.8h, v4.8h
+        add             v22.8h, v22.8h, v23.8h
+        srshr            v22.8h, v22.8h, #2
+        sub             v22.8h, v22.8h, v2.8h
+
+        // P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc, tc);
+
+        add             v23.8h, v0.8h, v1.8h // p3 + p2
+        add             v24.8h, v3.8h, v4.8h // p0 + q0
+        shl             v23.8h, v23.8h, #1 // * 2
+        add             v23.8h, v23.8h, v24.8h
+        add             v24.8h, v1.8h, v2.8h // p2 + p1
+        add             v23.8h, v23.8h, v24.8h
+        srshr            v23.8h, v23.8h, #3
+        sub             v23.8h, v23.8h, v1.8h
+
+        // Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc3, tc3);
+        add             v24.8h, v3.8h, v4.8h   // (p0 + q0
+        add             v24.8h, v5.8h, v24.8h  //     + q1)
+        shl             v24.8h, v24.8h, #1     //           * 2
+        add             v25.8h, v2.8h, v6.8h   //   (p1 + q2)
+        add             v24.8h, v25.8h, v24.8h // +
+        srshr           v24.8h, v24.8h, #3     //               >> 3
+        sub             v24.8h, v24.8h, v4.8h  //                    - q0
+
+        // Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
+
+        add             v25.8h, v6.8h, v5.8h
+        add             v26.8h, v3.8h, v4.8h
+        add             v25.8h, v25.8h, v26.8h
+        srshr           v25.8h, v25.8h, #2
+        sub             v25.8h, v25.8h, v5.8h
+
+        // Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc, tc);
+
+        add             v26.8h, v7.8h, v6.8h
+        add             v27.8h, v6.8h, v5.8h
+        shl             v26.8h, v26.8h, #1
+        add             v26.8h, v26.8h, v27.8h
+        add             v27.8h, v3.8h, v4.8h
+        add             v26.8h, v26.8h, v27.8h
+        srshr           v26.8h, v26.8h, #3
+        sub             v26.8h, v26.8h, v6.8h
+
+        // this clip should work properly
+        shl             v30.8h, v18.8h, #1 // tc2
+        neg             v31.8h, v30.8h // -tc2
+        clip            v31.8h, v30.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h
+
+        and             v21.16b, v21.16b, v16.16b
+        and             v22.16b, v22.16b, v16.16b
+        and             v23.16b, v23.16b, v16.16b
+        and             v24.16b, v24.16b, v16.16b
+        and             v25.16b, v25.16b, v16.16b
+        and             v26.16b, v26.16b, v16.16b
+
+        add             v23.8h, v23.8h, v1.8h // careful
+        add             v22.8h, v22.8h, v2.8h
+        add             v21.8h, v21.8h, v3.8h
+        add             v24.8h, v24.8h, v4.8h
+        add             v25.8h, v25.8h, v5.8h
+        add             v26.8h, v26.8h, v6.8h
+
+        cmp             w12, w13
+        b.hi            2f // only strong/strong, skip weak
+
+1:      // WEAK FILTER
+
+        // delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
+.if \bitdepth < 12
+        sub             v27.8h, v4.8h, v3.8h // q0 - p0
+        shl             v30.8h, v27.8h, #3 // * 8
+        add             v27.8h, v27.8h, v30.8h // 9 * (q0 - p0)
+
+        sub             v30.8h, v5.8h, v2.8h // q1 - p1
+        shl             v31.8h, v30.8h, #1 // * 2
+
+        sub             v27.8h, v27.8h, v31.8h
+        sub             v27.8h, v27.8h, v30.8h // - 3 * (q1 - p1)
+        srshr           v27.8h, v27.8h, #4
+.else
+        ssubl           v19.4s, v4.4h, v3.4h // q0 - p0
+        ssubl2          v20.4s, v4.8h, v3.8h
+
+        shl             v30.4s, v19.4s, #3 // * 8
+        shl             v31.4s, v20.4s, #3
+
+        add             v30.4s, v30.4s, v19.4s // 9 * (q0 - p0)
+        add             v31.4s, v31.4s, v20.4s
+
+        ssubl           v19.4s, v5.4h, v2.4h // q1 - p1
+        ssubl2          v20.4s, v5.8h, v2.8h
+
+        sub             v30.4s, v30.4s, v19.4s
+        sub             v31.4s, v31.4s, v20.4s
+
+        shl             v19.4s, v19.4s, #1
+        shl             v20.4s, v20.4s, #1
+
+        sub             v30.4s, v30.4s, v19.4s
+        sub             v31.4s, v31.4s, v20.4s
+
+        sqrshrn         v27.4h, v30.4s, #4
+        sqrshrn2        v27.8h, v31.4s, #4
+.endif
+
+        // delta0 10tc check mask
+        shl             v30.8h, v18.8h, #1 // * 2
+        shl             v31.8h, v18.8h, #3 // * 8
+        add             v30.8h, v30.8h, v31.8h // 10 * tc
+        abs             v31.8h, v27.8h
+        cmgt            v20.8h, v30.8h, v31.8h // abs(delta0) < 10 * tc
+
+        and             v20.16b, v20.16b, v16.16b // combine with full mask
+
+        neg             v31.8h, v18.8h // -tc
+        clip            v31.8h, v18.8h, v27.8h // delta0 = av_clip(delta0, -tc, tc)
+
+        // deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2)
+        add             v30.8h, v1.8h, v3.8h
+        srshr           v30.8h, v30.8h, #1
+        sub             v30.8h, v30.8h, v2.8h
+        add             v30.8h, v30.8h, v27.8h
+        sshr            v30.8h, v30.8h, #1
+
+        // p3 p2 p1 p0 q0 q1 q2 q3
+        // v0 v1 v2 v3 v4 v5 v6 v7
+
+        // deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
+        add             v31.8h, v6.8h, v4.8h
+        srshr           v31.8h, v31.8h, #1
+        sub             v31.8h, v31.8h, v5.8h
+        sub             v31.8h, v31.8h, v27.8h
+        sshr            v31.8h, v31.8h, #1
+
+        // apply nd_p nd_q mask to deltap1/deltaq1
+        and             v30.16b, v30.16b, v28.16b
+        and             v31.16b, v31.16b, v29.16b
+
+        // apply full skip mask to deltap1/deltaq1/delta0
+        and             v30.16b, v30.16b, v20.16b
+        and             v27.16b, v27.16b, v20.16b
+        and             v31.16b, v31.16b, v20.16b
+
+        // clip P1/Q1 to -tc_2, tc_2
+        sshr            v18.8h, v18.8h, #1 // tc2
+        neg             v28.8h, v18.8h
+        clip            v28.8h, v18.8h, v30.8h, v31.8h
+
+        // P0 = av_clip_pixel(p0 + delta0)
+        // Q0 = av_clip_pixel(q0 - delta0)
+        add             v29.8h, v3.8h, v27.8h // P0
+        sub             v27.8h, v4.8h, v27.8h // Q0
+
+        // P1 = av_clip_pixel(p1 + deltap1)
+        // Q1 = av_clip_pixel(q1 + deltaq1)
+        add             v30.8h, v2.8h, v30.8h // P1
+        add             v31.8h, v5.8h, v31.8h // Q1
+
+2:      // MIX WEAK/STRONG
+
+        mov             v19.16b, v1.16b
+        mov             v20.16b, v6.16b
+        // copy selection mask
+        mov             v1.16b, v17.16b
+        mov             v2.16b, v17.16b
+        mov             v3.16b, v17.16b
+        mov             v4.16b, v17.16b
+        mov             v5.16b, v17.16b
+        mov             v6.16b, v17.16b
+        // select
+        bsl             v1.16b, v23.16b, v19.16b // P2 strong/orig
+        bsl             v2.16b, v22.16b, v30.16b // P1 strong/weak
+        bsl             v3.16b, v21.16b, v29.16b // P0 strong/weak
+        bsl             v4.16b, v24.16b, v27.16b // Q0 strong/weak
+        bsl             v5.16b, v25.16b, v31.16b // Q1 strong/weak
+        bsl             v6.16b, v26.16b, v20.16b // Q2 strong/orig
+        // NOTE: Q3/P3 are unchanged
+
+.if \bitdepth > 8
+        movi            v19.8h, #0
+        dup             v20.8h, w14
+        clip            v19.8h, v20.8h, v1.8h, v2.8h, v3.8h, v4.8h, v5.8h, v6.8h
+.else
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+        sqxtun          v2.8b, v2.8h
+        sqxtun          v3.8b, v3.8h
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        sqxtun          v7.8b, v7.8h
+.endif
+        ret
+3:      ret x6
+endfunc
+.endm
+
+hevc_loop_filter_luma_body 8
+hevc_loop_filter_luma_body 10
+hevc_loop_filter_luma_body 12
+
+// hevc_v_loop_filter_luma(uint8_t *pix, ptrdiff_t stride, int beta, const int32_t *tc, const uint8_t *no_p, const uint8_t *no_q)
+
+.macro hevc_loop_filter_luma dir bitdepth
+function ff_hevc_\dir\()_loop_filter_luma_\bitdepth\()_neon, export=1
+        mov             x6, x30
+.if \dir == v
+.if \bitdepth > 8
+        sub             x0, x0, #8
+.else
+        sub             x0, x0, #4
+.endif
+.else
+        sub             x0, x0, x1, lsl #2 // -4 * xstride, idk
+.endif
+        mov             x10, x0
+.if \bitdepth > 8
+        ld1             {v0.8h}, [x0], x1
+        ld1             {v1.8h}, [x0], x1
+        ld1             {v2.8h}, [x0], x1
+        ld1             {v3.8h}, [x0], x1
+        ld1             {v4.8h}, [x0], x1
+        ld1             {v5.8h}, [x0], x1
+        ld1             {v6.8h}, [x0], x1
+        ld1             {v7.8h}, [x0]
+        mov             w14, #((1 << \bitdepth) - 1)
+.if \dir == v
+        transpose_8x8H  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+.endif
+.else
+        ld1             {v0.8b}, [x0], x1
+        ld1             {v1.8b}, [x0], x1
+        ld1             {v2.8b}, [x0], x1
+        ld1             {v3.8b}, [x0], x1
+        ld1             {v4.8b}, [x0], x1
+        ld1             {v5.8b}, [x0], x1
+        ld1             {v6.8b}, [x0], x1
+        ld1             {v7.8b}, [x0]
+.if \dir == v
+        transpose_8x8B  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+.endif
+.endif
+        bl hevc_loop_filter_luma_body_\bitdepth\()_neon
+.if \bitdepth > 8
+.if \dir == v
+        transpose_8x8H  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+.endif
+        st1             {v0.8h}, [x10], x1
+        st1             {v1.8h}, [x10], x1
+        st1             {v2.8h}, [x10], x1
+        st1             {v3.8h}, [x10], x1
+        st1             {v4.8h}, [x10], x1
+        st1             {v5.8h}, [x10], x1
+        st1             {v6.8h}, [x10], x1
+        st1             {v7.8h}, [x10]
+.else
+.if \dir == v
+        transpose_8x8B  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+.endif
+        st1             {v0.8b}, [x10], x1
+        st1             {v1.8b}, [x10], x1
+        st1             {v2.8b}, [x10], x1
+        st1             {v3.8b}, [x10], x1
+        st1             {v4.8b}, [x10], x1
+        st1             {v5.8b}, [x10], x1
+        st1             {v6.8b}, [x10], x1
+        st1             {v7.8b}, [x10]
+.endif
+        ret x6
+endfunc
+.endm
+
+hevc_loop_filter_luma h 8
+hevc_loop_filter_luma h 10
+hevc_loop_filter_luma h 12
+
+hevc_loop_filter_luma v 8
+hevc_loop_filter_luma v 10
+hevc_loop_filter_luma v 12
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 687b6cc5c3..04692aa98e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -38,6 +38,18 @@ void ff_hevc_h_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride,
                                           const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
 void ff_hevc_h_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride,
                                           const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_luma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_luma_10_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_luma_12_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_10_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_12_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
 void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs,
                                      ptrdiff_t stride);
 void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs,
@@ -291,6 +303,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
     if (!have_neon(cpu_flags)) return;
 
     if (bit_depth == 8) {
+        c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_8_neon;
+        c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_8_neon;
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_8_neon;
         c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_8_neon;
         c->add_residual[0]             = ff_hevc_add_residual_4x4_8_neon;
@@ -379,6 +393,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 
     }
     if (bit_depth == 10) {
+        c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_10_neon;
+        c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_10_neon;
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
         c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_10_neon;
         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
@@ -395,6 +411,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_10_neon;
     }
     if (bit_depth == 12) {
+        c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_12_neon;
+        c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_12_neon;
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_12_neon;
         c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_12_neon;
         c->add_residual[0]             = ff_hevc_add_residual_4x4_12_neon;
-- 
2.43.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON
  2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON J. Dekker
@ 2024-02-21 12:08   ` Martin Storsjö
  0 siblings, 0 replies; 10+ messages in thread
From: Martin Storsjö @ 2024-02-21 12:08 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, 21 Feb 2024, J. Dekker wrote:

> Benched using single-threaded full decode on an Ampere Altra.
>
> Bpp Before  After  Speedup
> 8   73,3s   65,2s  1.124x
> 10  114,2s  104,0s 1.098x
> 12  125,8s  115,7s 1.087x
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_deblock_neon.S | 421 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
> 2 files changed, 439 insertions(+)

> +0:      // STRONG FILTER
> +
> +        // P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc3, tc3);
> +        add             v21.8h, v2.8h, v3.8h   // (p1 + p0
> +        add             v21.8h, v4.8h, v21.8h  //     + q0)
> +        shl             v21.8h, v21.8h, #1     //           * 2
> +        add             v22.8h, v1.8h, v5.8h   //   (p2 + q1)
> +        add             v21.8h, v22.8h, v21.8h // +
> +        srshr            v21.8h, v21.8h, #3     //               >> 3
> +        sub             v21.8h, v21.8h, v3.8h  //                    - p0
> +

The srshr line is incorrectly indented here (and elsewhere)

> +        sqxtun          v4.8b, v4.8h
> +        sqxtun          v5.8b, v5.8h
> +        sqxtun          v6.8b, v6.8h
> +        sqxtun          v7.8b, v7.8h
> +.endif
> +        ret
> +3:      ret x6

Please indent the "x6" here like other operands

> +.macro hevc_loop_filter_luma dir bitdepth
> +function ff_hevc_\dir\()_loop_filter_luma_\bitdepth\()_neon, export=1
> +        mov             x6, x30
> +.if \dir == v

In GAS assembler, .if does a numerical comparison - it can't do string 
comparisons.

The right way to do this is to do ".ifc \dir, v", which does a string 
comparison.

(If you really do need to do this like a numerical comparison, it's 
possible to define e.g. "v" as a numeric symbol as well, see e.g. 
https://code.videolan.org/videolan/dav1d/-/merge_requests/1603/diffs?commit_id=d4746c908c56cb2e8545efd348b8cdc13f2f2253 
but that's not really the nicest way to do it.)

This issue breaks compilation with Clang. With gas-preprocessor (for 
MSVC), it manages to build correctly, but does the wrong thing.


To avoid me having to test all these build configurations manually, 
remembering to check all these corner case build configurations and check 
indentation and all, I've set up a PoC for testing such things on Github 
Actions.

If you have a repo on github, grab my commits from 
https://github.com/mstorsjo/FFmpeg/commits/gha-aarch64 (there are a couple 
of them), add your changes on top of these, and push it as a branch to 
your own github repo, then check the output from the actions.

Here's the output of a run with the patches you just posted: 
https://github.com/mstorsjo/FFmpeg/actions/runs/7988312683

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock
  2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock J. Dekker
@ 2024-02-23  3:14   ` Andreas Rheinhardt
  2024-02-24  9:54     ` J. Dekker
  2024-02-23 12:45   ` Nuo Mi
  1 sibling, 1 reply; 10+ messages in thread
From: Andreas Rheinhardt @ 2024-02-23  3:14 UTC (permalink / raw)
  To: ffmpeg-devel

J. Dekker:
> Over/underflow in some cases.
> 
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>  libavcodec/x86/hevcdsp_init.c | 9 +++++----
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
> index 31e81eb11f..11cb1b3bfd 100644
> --- a/libavcodec/x86/hevcdsp_init.c
> +++ b/libavcodec/x86/hevcdsp_init.c
> @@ -1205,10 +1205,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
>          if (EXTERNAL_SSE2(cpu_flags)) {
>              c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
>              c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
> -            if (ARCH_X86_64) {
> -                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
> -                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
> -            }
> +            // FIXME: 12-bit luma deblock over/underflows in some cases
> +            // if (ARCH_X86_64) {
> +            //     c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
> +            //     c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
> +            // }
>              SAO_BAND_INIT(12, sse2);
>              SAO_EDGE_INIT(12, sse2);
>  

If you disable them here, you should also ensure that they are not
assembled at all.

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock
  2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock J. Dekker
  2024-02-23  3:14   ` Andreas Rheinhardt
@ 2024-02-23 12:45   ` Nuo Mi
  2024-02-24  9:49     ` J. Dekker
  1 sibling, 1 reply; 10+ messages in thread
From: Nuo Mi @ 2024-02-23 12:45 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, Feb 21, 2024 at 7:10 PM J. Dekker <jdek@itanimul.li> wrote:

> Over/underflow in some cases.
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>  libavcodec/x86/hevcdsp_init.c | 9 +++++----
>  1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
> index 31e81eb11f..11cb1b3bfd 100644
> --- a/libavcodec/x86/hevcdsp_init.c
> +++ b/libavcodec/x86/hevcdsp_init.c
> @@ -1205,10 +1205,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const
> int bit_depth)
>          if (EXTERNAL_SSE2(cpu_flags)) {
>              c->hevc_v_loop_filter_chroma =
> ff_hevc_v_loop_filter_chroma_12_sse2;
>              c->hevc_h_loop_filter_chroma =
> ff_hevc_h_loop_filter_chroma_12_sse2;
> -            if (ARCH_X86_64) {
> -                c->hevc_v_loop_filter_luma =
> ff_hevc_v_loop_filter_luma_12_sse2;
> -                c->hevc_h_loop_filter_luma =
> ff_hevc_h_loop_filter_luma_12_sse2;
> -            }
> +            // FIXME: 12-bit luma deblock over/underflows in some cases
> +            // if (ARCH_X86_64) {
> +            //     c->hevc_v_loop_filter_luma =
> ff_hevc_v_loop_filter_luma_12_sse2;
> +            //     c->hevc_h_loop_filter_luma =
> ff_hevc_h_loop_filter_luma_12_sse2;
> +            // }
>              SAO_BAND_INIT(12, sse2);
>              SAO_EDGE_INIT(12, sse2);
>
Hi Dekker,
VVC will utilize this function as well.
Could you please share the HEVC clip or data that caused the overflow?
We'll make efforts to address it during the VVC porting

Thank you.

>
> --
> 2.43.2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock
  2024-02-23 12:45   ` Nuo Mi
@ 2024-02-24  9:49     ` J. Dekker
  2024-02-24 10:46       ` Martin Storsjö
  0 siblings, 1 reply; 10+ messages in thread
From: J. Dekker @ 2024-02-24  9:49 UTC (permalink / raw)
  To: ffmpeg-devel


Nuo Mi <nuomi2021@gmail.com> writes:

> On Wed, Feb 21, 2024 at 7:10 PM J. Dekker <jdek@itanimul.li> wrote:
>
>> Over/underflow in some cases.
>>
>> Signed-off-by: J. Dekker <jdek@itanimul.li>
>> ---
>>  libavcodec/x86/hevcdsp_init.c | 9 +++++----
>>  1 file changed, 5 insertions(+), 4 deletions(-)
>>
>> diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
>> index 31e81eb11f..11cb1b3bfd 100644
>> --- a/libavcodec/x86/hevcdsp_init.c
>> +++ b/libavcodec/x86/hevcdsp_init.c
>> @@ -1205,10 +1205,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const
>> int bit_depth)
>>          if (EXTERNAL_SSE2(cpu_flags)) {
>>              c->hevc_v_loop_filter_chroma =
>> ff_hevc_v_loop_filter_chroma_12_sse2;
>>              c->hevc_h_loop_filter_chroma =
>> ff_hevc_h_loop_filter_chroma_12_sse2;
>> -            if (ARCH_X86_64) {
>> -                c->hevc_v_loop_filter_luma =
>> ff_hevc_v_loop_filter_luma_12_sse2;
>> -                c->hevc_h_loop_filter_luma =
>> ff_hevc_h_loop_filter_luma_12_sse2;
>> -            }
>> +            // FIXME: 12-bit luma deblock over/underflows in some cases
>> +            // if (ARCH_X86_64) {
>> +            //     c->hevc_v_loop_filter_luma =
>> ff_hevc_v_loop_filter_luma_12_sse2;
>> +            //     c->hevc_h_loop_filter_luma =
>> ff_hevc_h_loop_filter_luma_12_sse2;
>> +            // }
>>              SAO_BAND_INIT(12, sse2);
>>              SAO_EDGE_INIT(12, sse2);
>>
> Hi Dekker,
> VVC will utilize this function as well.
> Could you please share the HEVC clip or data that caused the overflow?
> We'll make efforts to address it during the VVC porting
>

You can just run ./tests/checkasm/checkasm --test=hevc_deblock to
find a failing case. My guess is that delta0 overflows before the right
shift, see the ARM64 asm which specfically widens this calculation on 12
bit variant but I'm not 100%, I don't know x86 asm.

-- 
jd
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock
  2024-02-23  3:14   ` Andreas Rheinhardt
@ 2024-02-24  9:54     ` J. Dekker
  0 siblings, 0 replies; 10+ messages in thread
From: J. Dekker @ 2024-02-24  9:54 UTC (permalink / raw)
  To: ffmpeg-devel


Andreas Rheinhardt <andreas.rheinhardt@outlook.com> writes:
> J. Dekker:
>>              SAO_BAND_INIT(12, sse2);
>>              SAO_EDGE_INIT(12, sse2);
>>  
>
> If you disable them here, you should also ensure that they are not
> assembled at all.
>
> - Andreas

Sure, will do on push if no other things to resolve in the latest set.

-- 
jd
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock
  2024-02-24  9:49     ` J. Dekker
@ 2024-02-24 10:46       ` Martin Storsjö
  2024-02-24 18:32         ` J. Dekker
  0 siblings, 1 reply; 10+ messages in thread
From: Martin Storsjö @ 2024-02-24 10:46 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Sat, 24 Feb 2024, J. Dekker wrote:

>
> Nuo Mi <nuomi2021@gmail.com> writes:
>
>> On Wed, Feb 21, 2024 at 7:10 PM J. Dekker <jdek@itanimul.li> wrote:
>>
>>> Over/underflow in some cases.
>>>
>>> Signed-off-by: J. Dekker <jdek@itanimul.li>
>>> ---
>>>  libavcodec/x86/hevcdsp_init.c | 9 +++++----
>>>  1 file changed, 5 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
>>> index 31e81eb11f..11cb1b3bfd 100644
>>> --- a/libavcodec/x86/hevcdsp_init.c
>>> +++ b/libavcodec/x86/hevcdsp_init.c
>>> @@ -1205,10 +1205,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const
>>> int bit_depth)
>>>          if (EXTERNAL_SSE2(cpu_flags)) {
>>>              c->hevc_v_loop_filter_chroma =
>>> ff_hevc_v_loop_filter_chroma_12_sse2;
>>>              c->hevc_h_loop_filter_chroma =
>>> ff_hevc_h_loop_filter_chroma_12_sse2;
>>> -            if (ARCH_X86_64) {
>>> -                c->hevc_v_loop_filter_luma =
>>> ff_hevc_v_loop_filter_luma_12_sse2;
>>> -                c->hevc_h_loop_filter_luma =
>>> ff_hevc_h_loop_filter_luma_12_sse2;
>>> -            }
>>> +            // FIXME: 12-bit luma deblock over/underflows in some cases
>>> +            // if (ARCH_X86_64) {
>>> +            //     c->hevc_v_loop_filter_luma =
>>> ff_hevc_v_loop_filter_luma_12_sse2;
>>> +            //     c->hevc_h_loop_filter_luma =
>>> ff_hevc_h_loop_filter_luma_12_sse2;
>>> +            // }
>>>              SAO_BAND_INIT(12, sse2);
>>>              SAO_EDGE_INIT(12, sse2);
>>>
>> Hi Dekker,
>> VVC will utilize this function as well.
>> Could you please share the HEVC clip or data that caused the overflow?
>> We'll make efforts to address it during the VVC porting
>>
>
> You can just run ./tests/checkasm/checkasm --test=hevc_deblock to
> find a failing case.

To clarify, this is with the new checkasm test added in this patchset, not 
currently in git master - otherwise fate would be failing for everybody on 
x86.

> My guess is that delta0 overflows before the right
> shift, see the ARM64 asm which specfically widens this calculation on 12
> bit variant but I'm not 100%, I don't know x86 asm.

Are you sure the input is within valid range? It's always possible that 
checkasm produces inputs that the real decoder wouldn't - but it's also 
possible that this is a real decoder bug that just hasn't been triggered 
by any other test yet.

// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock
  2024-02-24 10:46       ` Martin Storsjö
@ 2024-02-24 18:32         ` J. Dekker
  0 siblings, 0 replies; 10+ messages in thread
From: J. Dekker @ 2024-02-24 18:32 UTC (permalink / raw)
  To: ffmpeg-devel


Martin Storsjö <martin@martin.st> writes:
> [...]
>
> Are you sure the input is within valid range? It's always possible that
> checkasm produces inputs that the real decoder wouldn't - but it's also
> possible that this is a real decoder bug that just hasn't been triggered by any
> other test yet.
>
> // Martin

The checkasm was just written to just to trigger all the theoretical
edgecases. I know there is a decent range of values which pass the d0 +
d3 < beta check and overflow in (9 * (q0 - p0) - 3 * (q1 - p1) + 8) for
int16_t. I'm not 100% sure that these values can be output by the
decoder, and even if so they're rare.

-- 
jd
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-02-24 19:04 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-21 11:10 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_deblock: add luma and chroma full J. Dekker
2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock J. Dekker
2024-02-23  3:14   ` Andreas Rheinhardt
2024-02-24  9:54     ` J. Dekker
2024-02-23 12:45   ` Nuo Mi
2024-02-24  9:49     ` J. Dekker
2024-02-24 10:46       ` Martin Storsjö
2024-02-24 18:32         ` J. Dekker
2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON J. Dekker
2024-02-21 12:08   ` Martin Storsjö

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git